diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 17d3e6ab7c65ab..95d0ad0f9dc96a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -345,15 +345,6 @@ extern char &AMDGPUPrintfRuntimeBindingID;
 void initializeAMDGPUResourceUsageAnalysisPass(PassRegistry &);
 extern char &AMDGPUResourceUsageAnalysisID;
 
-struct AMDGPUExpandPseudoIntrinsicsPass
-    : PassInfoMixin<AMDGPUExpandPseudoIntrinsicsPass> {
-  const AMDGPUTargetMachine &TM;
-  AMDGPUExpandPseudoIntrinsicsPass(const AMDGPUTargetMachine &ATM) : TM(ATM) {}
-  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
-
-  static bool isRequired() { return true; }
-};
-
 struct AMDGPUPrintfRuntimeBindingPass
     : PassInfoMixin<AMDGPUPrintfRuntimeBindingPass> {
   PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp
deleted file mode 100644
index bf0ec39ab6c6e7..00000000000000
--- a/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp
+++ /dev/null
@@ -1,59 +0,0 @@
-//===- AMDGPUExpandPseudoIntrinsics.cpp - Pseudo Intrinsic Expander Pass --===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-// This file implements a pass that deals with expanding AMDGCN generic pseudo-
-// intrinsics into target specific quantities / sequences. In this context, a
-// pseudo-intrinsic is an AMDGCN intrinsic that does not directly map to a
-// specific instruction, but rather is intended as a mechanism for abstractly
-// conveying target specific info to a HLL / the FE, without concretely
-// impacting the AST. An example of such an intrinsic is amdgcn.wavefrontsize.
-// This pass should run as early as possible / immediately after Clang CodeGen,
-// so that the optimisation pipeline and the BE operate with concrete target
-// data.
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "AMDGPUTargetMachine.h"
-#include "GCNSubtarget.h"
-
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IntrinsicsAMDGPU.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Pass.h"
-
-using namespace llvm;
-
-static inline PreservedAnalyses expandWaveSizeIntrinsic(const GCNSubtarget &ST,
-                                                        Function *WaveSize) {
-  if (WaveSize->hasZeroLiveUses())
-    return PreservedAnalyses::all();
-
-  for (auto &&U : WaveSize->users())
-    U->replaceAllUsesWith(
-        ConstantInt::get(WaveSize->getReturnType(), ST.getWavefrontSize()));
-
-  return PreservedAnalyses::none();
-}
-
-PreservedAnalyses
-AMDGPUExpandPseudoIntrinsicsPass::run(Module &M, ModuleAnalysisManager &) {
-  if (M.empty())
-    return PreservedAnalyses::all();
-
-  const auto &ST = TM.getSubtarget<GCNSubtarget>(*M.begin());
-
-  // This is not a concrete target, we should not fold early.
-  if (ST.getCPU().empty() || ST.getCPU() == "generic")
-    return PreservedAnalyses::all();
-
-  if (auto WS = Intrinsic::getDeclarationIfExists(
-          &M, Intrinsic::amdgcn_wavefrontsize))
-    return expandWaveSizeIntrinsic(ST, WS);
-
-  return PreservedAnalyses::all();
-}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 8beb9defee66a0..0b2548af72fc0d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -1024,6 +1024,15 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
     }
     break;
   }
+  case Intrinsic::amdgcn_wavefrontsize: {
+    // TODO: this is a workaround for the pseudo-generic target one gets with no
+    // specified mcpu, which spoofs its wave size to 64; it should be removed.
+    if ((ST->getCPU().empty() || ST->getCPU() == "generic") &&
+        !ST->getFeatureString().contains("+wavefrontsize"))
+      break;
+    return IC.replaceInstUsesWith(
+        II, ConstantInt::get(II.getType(), ST->getWavefrontSize()));
+  }
   case Intrinsic::amdgcn_wqm_vote: {
     // wqm_vote is identity when the argument is constant.
     if (!isa<Constant>(II.getArgOperand(0)))
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 323c195c329168..174a90f0aa419d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -27,8 +27,6 @@ MODULE_PASS("amdgpu-perf-hint",
               *static_cast<const GCNTargetMachine *>(this)))
 MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass())
 MODULE_PASS("amdgpu-unify-metadata", AMDGPUUnifyMetadataPass())
-MODULE_PASS("amdgpu-expand-pseudo-intrinsics",
-            AMDGPUExpandPseudoIntrinsicsPass(*this))
 #undef MODULE_PASS
 
 #ifndef MODULE_PASS_WITH_PARAMS
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index c9d4452b4a035c..fed29c3e14aae2 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -54,7 +54,6 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUCodeGenPrepare.cpp
   AMDGPUCombinerHelper.cpp
   AMDGPUCtorDtorLowering.cpp
-  AMDGPUExpandPseudoIntrinsics.cpp
   AMDGPUExportClustering.cpp
   AMDGPUFrameLowering.cpp
   AMDGPUGlobalISelDivergenceLowering.cpp
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
index 2d060fd4305077..33dd2bd540ad06 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
@@ -1,21 +1,9 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W32 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -verify-machineinstrs -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,W32 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s
 
-; RUN: opt -O3 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -O3 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -passes='default<O3>' -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=tonga -O3 -S < %s | FileCheck -check-prefix=OPT-W64 %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT-W64 %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT-W32 %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT-W64 %s
-
 ; GCN-LABEL: {{^}}fold_wavefrontsize:
 
 ; W32:       v_mov_b32_e32 [[V:v[0-9]+]], 32
@@ -24,25 +12,7 @@
 
 
 define amdgpu_kernel void @fold_wavefrontsize(ptr addrspace(1) nocapture %arg) {
-; OPT-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
-; OPT-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
-; OPT-NEXT:  [[BB:.*:]]
-; OPT-NEXT:    [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR2:[0-9]+]]
-; OPT-NEXT:    store i32 [[TMP]], ptr addrspace(1) [[ARG]], align 4
-; OPT-NEXT:    ret void
-;
-; OPT-W64-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
-; OPT-W64-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
-; OPT-W64-NEXT:  [[BB:.*:]]
-; OPT-W64-NEXT:    store i32 64, ptr addrspace(1) [[ARG]], align 4
-; OPT-W64-NEXT:    ret void
-;
-; OPT-W32-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
-; OPT-W32-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
-; OPT-W32-NEXT:  [[BB:.*:]]
-; OPT-W32-NEXT:    store i32 32, ptr addrspace(1) [[ARG]], align 4
-; OPT-W32-NEXT:    ret void
-;
+
 bb:
   %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
   store i32 %tmp, ptr addrspace(1) %arg, align 4
@@ -58,27 +28,6 @@ bb:
 
 
 define amdgpu_kernel void @fold_and_optimize_wavefrontsize(ptr addrspace(1) nocapture %arg) {
-; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(
-; OPT-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; OPT-NEXT:  [[BB:.*:]]
-; OPT-NEXT:    [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR2]]
-; OPT-NEXT:    [[TMP1:%.*]] = icmp ugt i32 [[TMP]], 32
-; OPT-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 2, i32 1
-; OPT-NEXT:    store i32 [[TMP2]], ptr addrspace(1) [[ARG]], align 4
-; OPT-NEXT:    ret void
-;
-; OPT-W64-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(
-; OPT-W64-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; OPT-W64-NEXT:  [[BB:.*:]]
-; OPT-W64-NEXT:    store i32 2, ptr addrspace(1) [[ARG]], align 4
-; OPT-W64-NEXT:    ret void
-;
-; OPT-W32-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(
-; OPT-W32-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; OPT-W32-NEXT:  [[BB:.*:]]
-; OPT-W32-NEXT:    store i32 1, ptr addrspace(1) [[ARG]], align 4
-; OPT-W32-NEXT:    ret void
-;
 bb:
   %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
   %tmp1 = icmp ugt i32 %tmp, 32
@@ -90,29 +39,6 @@ bb:
 ; GCN-LABEL: {{^}}fold_and_optimize_if_wavefrontsize:
 
 define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(ptr addrspace(1) nocapture %arg) {
-; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
-; OPT-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; OPT-NEXT:  [[BB:.*:]]
-; OPT-NEXT:    [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR2]]
-; OPT-NEXT:    [[TMP1:%.*]] = icmp ugt i32 [[TMP]], 32
-; OPT-NEXT:    br i1 [[TMP1]], label %[[BB2:.*]], label %[[BB3:.*]]
-; OPT:       [[BB2]]:
-; OPT-NEXT:    store i32 1, ptr addrspace(1) [[ARG]], align 4
-; OPT-NEXT:    br label %[[BB3]]
-; OPT:       [[BB3]]:
-; OPT-NEXT:    ret void
-;
-; OPT-W64-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
-; OPT-W64-SAME: ptr addrspace(1) nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; OPT-W64-NEXT:  [[BB:.*:]]
-; OPT-W64-NEXT:    store i32 1, ptr addrspace(1) [[ARG]], align 4
-; OPT-W64-NEXT:    ret void
-;
-; OPT-W32-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
-; OPT-W32-SAME: ptr addrspace(1) nocapture readnone [[ARG:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] {
-; OPT-W32-NEXT:  [[BB:.*:]]
-; OPT-W32-NEXT:    ret void
-;
 bb:
   %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
   %tmp1 = icmp ugt i32 %tmp, 32
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wavefrontsize.ll b/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wavefrontsize.ll
new file mode 100644
index 00000000000000..d9c105f753e264
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wavefrontsize.ll
@@ -0,0 +1,114 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=amdgcn-- -passes=instcombine -S < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -mtriple=amdgcn-- -mattr=+wavefrontsize32 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W32 %s
+; RUN: opt -mtriple=amdgcn-- -mattr=+wavefrontsize64 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W64 %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=tonga -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W64 %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=+wavefrontsize32 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W32 %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=+wavefrontsize64 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W64 %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+wavefrontsize32 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W32 %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+wavefrontsize64 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W64 %s
+
+define amdgpu_kernel void @fold_wavefrontsize(ptr addrspace(1) nocapture %arg) {
+; OPT-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
+; OPT-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) {
+; OPT-NEXT:  [[BB:.*:]]
+; OPT-NEXT:    [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR1:[0-9]+]]
+; OPT-NEXT:    store i32 [[TMP]], ptr addrspace(1) [[ARG]], align 4
+; OPT-NEXT:    ret void
+;
+; OPT-W32-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
+; OPT-W32-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0:[0-9]+]] {
+; OPT-W32-NEXT:  [[BB:.*:]]
+; OPT-W32-NEXT:    store i32 32, ptr addrspace(1) [[ARG]], align 4
+; OPT-W32-NEXT:    ret void
+;
+; OPT-W64-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
+; OPT-W64-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0:[0-9]+]] {
+; OPT-W64-NEXT:  [[BB:.*:]]
+; OPT-W64-NEXT:    store i32 64, ptr addrspace(1) [[ARG]], align 4
+; OPT-W64-NEXT:    ret void
+;
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
+  store i32 %tmp, ptr addrspace(1) %arg, align 4
+  ret void
+}
+
+define amdgpu_kernel void @fold_and_optimize_wavefrontsize(ptr addrspace(1) nocapture %arg) {
+; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(
+; OPT-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) {
+; OPT-NEXT:  [[BB:.*:]]
+; OPT-NEXT:    [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR1]]
+; OPT-NEXT:    [[TMP1:%.*]] = icmp ugt i32 [[TMP]], 32
+; OPT-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 2, i32 1
+; OPT-NEXT:    store i32 [[TMP2]], ptr addrspace(1) [[ARG]], align 4
+; OPT-NEXT:    ret void
+;
+; OPT-W32-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(
+; OPT-W32-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0]] {
+; OPT-W32-NEXT:  [[BB:.*:]]
+; OPT-W32-NEXT:    store i32 1, ptr addrspace(1) [[ARG]], align 4
+; OPT-W32-NEXT:    ret void
+;
+; OPT-W64-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(
+; OPT-W64-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0]] {
+; OPT-W64-NEXT:  [[BB:.*:]]
+; OPT-W64-NEXT:    store i32 2, ptr addrspace(1) [[ARG]], align 4
+; OPT-W64-NEXT:    ret void
+;
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
+  %tmp1 = icmp ugt i32 %tmp, 32
+  %tmp2 = select i1 %tmp1, i32 2, i32 1
+  store i32 %tmp2, ptr addrspace(1) %arg
+  ret void
+}
+
+define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(ptr addrspace(1) nocapture %arg) {
+; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
+; OPT-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) {
+; OPT-NEXT:  [[BB:.*:]]
+; OPT-NEXT:    [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR1]]
+; OPT-NEXT:    [[TMP1:%.*]] = icmp ugt i32 [[TMP]], 32
+; OPT-NEXT:    br i1 [[TMP1]], label %[[BB2:.*]], label %[[BB3:.*]]
+; OPT:       [[BB2]]:
+; OPT-NEXT:    store i32 1, ptr addrspace(1) [[ARG]], align 4
+; OPT-NEXT:    br label %[[BB3]]
+; OPT:       [[BB3]]:
+; OPT-NEXT:    ret void
+;
+; OPT-W32-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
+; OPT-W32-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0]] {
+; OPT-W32-NEXT:  [[BB:.*:]]
+; OPT-W32-NEXT:    br i1 false, label %[[BB2:.*]], label %[[BB3:.*]]
+; OPT-W32:       [[BB2]]:
+; OPT-W32-NEXT:    br label %[[BB3]]
+; OPT-W32:       [[BB3]]:
+; OPT-W32-NEXT:    ret void
+;
+; OPT-W64-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
+; OPT-W64-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0]] {
+; OPT-W64-NEXT:  [[BB:.*:]]
+; OPT-W64-NEXT:    br i1 true, label %[[BB2:.*]], label %[[BB3:.*]]
+; OPT-W64:       [[BB2]]:
+; OPT-W64-NEXT:    store i32 1, ptr addrspace(1) [[ARG]], align 4
+; OPT-W64-NEXT:    br label %[[BB3]]
+; OPT-W64:       [[BB3]]:
+; OPT-W64-NEXT:    ret void
+;
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
+  %tmp1 = icmp ugt i32 %tmp, 32
+  br i1 %tmp1, label %bb2, label %bb3
+
+bb2:                                              ; preds = %bb
+  store i32 1, ptr addrspace(1) %arg, align 4
+  br label %bb3
+
+bb3:                                              ; preds = %bb2, %bb
+  ret void
+}
+
+declare i32 @llvm.amdgcn.wavefrontsize() #0
+
+attributes #0 = { nounwind readnone speculatable }