Fix typo under torch/_inductor directory (pytorch#110530)
This PR fixes typos in comments and messages in files under the `torch/_inductor` directory.

Pull Request resolved: pytorch#110530
Approved by: https://github.com/kit1980
kiszk authored and pytorchmergebot committed Oct 5, 2023
1 parent 9648df1 commit 434a996
Showing 18 changed files with 31 additions and 31 deletions.
2 changes: 1 addition & 1 deletion torch/_inductor/autotune_process.py
@@ -313,7 +313,7 @@ def benchmark(

results = {}

- # Use a ThreadExecutorPool to spread the work across the subproccesses and
+ # Use a ThreadExecutorPool to spread the work across the subprocesses and
# to grab subprocesses as soon as they're free.
for choice, result in zip(choices, self.executor.map(self.target, choices)):
results[choice] = result
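The hunk above fans benchmark work out over a pool of worker subprocesses via `executor.map` and pairs results back up with `zip`. A minimal, self-contained sketch of that pattern, assuming a plain `ProcessPoolExecutor` and a made-up per-choice workload (not the actual inductor benchmarking code):

from concurrent.futures import ProcessPoolExecutor

def benchmark_choice(choice):
    # Stand-in for compiling and timing one autotuning candidate.
    return sum(i * i for i in range(choice * 100_000))

def benchmark_all(choices):
    results = {}
    with ProcessPoolExecutor(max_workers=4) as executor:
        # map() hands work to workers as they free up and yields results in
        # input order, so zip() pairs each choice with its own result.
        for choice, result in zip(choices, executor.map(benchmark_choice, choices)):
            results[choice] = result
    return results

if __name__ == "__main__":
    print(benchmark_all([1, 2, 3, 4]))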
2 changes: 1 addition & 1 deletion torch/_inductor/codegen/cuda/gemm_template.py
@@ -21,7 +21,7 @@
{{template.globals().getvalue()}}
{{instance_definition}}
// When workspace_size is not a nullptr, populates requested workspace_size and returns.
- // Otherwise, compuates the Gemm kernel using the given workspace ptr.
+ // Otherwise, computes the Gemm kernel using the given workspace ptr.
extern "C" {
{{kernel.def_kernel(inputs=[X, W, Bias], outputs=[Y], names_str="X, W, Bias, Y", input_reorder=input_reorder)}} {
try {
8 changes: 4 additions & 4 deletions torch/_inductor/codegen/triton.py
@@ -1271,7 +1271,7 @@ def __call__(self):
return None
elif assert_min and assert_max:
# The conditions need to be in parens because of Python's operator precedence.
- # It'd be less error-prone to use and/or/not, which is suported by triton
+ # It'd be less error-prone to use and/or/not, which is supported by triton
cond = f"(0 <= {self.var}) & ({self.var} < {size_str})"
cond_print = f"0 <= {self.var} < {size_str}"
elif assert_min:
@@ -1918,7 +1918,7 @@ def codegen_kernel(self, name=None):
for numel in self.numels:
numel_hint = V.graph.sizevars.symbolic_hint(numel)
if not isinstance(numel_hint, (int, sympy.Integer)):
- # This default heuristic hint was picked carefuly: it is
+ # This default heuristic hint was picked carefully: it is
# large, to ensure that we don't shrink the block size (since
# if you don't have many elements, it'd be wasteful to pick a
# large block size). Since we don't know how many elements we
@@ -2184,7 +2184,7 @@ def warn_mix_layout(self, kernel_name):
for arg_name in call_args:
buf = V.graph.get_buffer(arg_name)
if buf and len(buf.layout.size) == 4:
- # ignore the tensor if only 1 dimention is non-zero
+ # ignore the tensor if only 1 dimension is non-zero
if len([x for x in buf.layout.size if x == 1]) == 3:
continue
stride_order = ir.get_stride_order(buf.layout.stride)
@@ -2519,7 +2519,7 @@ def codegen_comment(self, node_schedule):
if not any(
isinstance(n, ForeachKernelSchedulerNode) for n in node_schedule
):
- # We probablly should look what are the nodes inside a foreach
+ # We probably should look what are the nodes inside a foreach
# schedule node
node_names = [
n.get_name()
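The first triton.py hunk above notes that the generated bounds check needs parentheses because of Python's operator precedence. A small illustration of the pitfall using plain tensors (this is an explanatory example, not generated Triton code):

import torch

idx = torch.tensor([0, 3, 9])
size = 5

# Correct: compare first, then combine the boolean masks with &.
mask = (0 <= idx) & (idx < size)
print(mask)  # tensor([ True,  True, False])

# Without parentheses, & binds tighter than the comparisons, so the line below
# parses as 0 <= (idx & idx) < size, a chained comparison on tensors that
# raises "Boolean value of Tensor with more than one element is ambiguous".
# mask = 0 <= idx & idx < size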
2 changes: 1 addition & 1 deletion torch/_inductor/codegen/wrapper.py
@@ -1116,7 +1116,7 @@ def write_wrapper_decl(self):
f"""std::vector<at::Tensor> {self.call_func_name}(const std::vector<at::Tensor>& inputs) {{"""
)
with self.prefix.indent():
- # assign inputs and outpus in both cases so the later codegen can be simplified
+ # assign inputs and outputs in both cases so the later codegen can be simplified
if V.graph.aot_mode:
if config.aot_inductor.abi_compatible:
self.prefix.splice(
8 changes: 4 additions & 4 deletions torch/_inductor/compile_fx.py
@@ -297,7 +297,7 @@ def compile_fx_inner(
"""
Inductor API that compiles a single graph.
- If you change the argument list for this funtion, make sure you
+ If you change the argument list for this function, make sure you
also update the call to save_args_for_compile_fx_inner below accordingly.
"""
if dynamo_utils.count_calls(gm.graph) == 0:
@@ -361,7 +361,7 @@ def compile_fx_inner(

# doesnt work for non-trees because the warmup run would apply mutation twice
if config.triton.cudagraph_trees:
- # checking if mutation is only on paramameters/static inputs
+ # checking if mutation is only on parameters/static inputs
has_mutation = not all(
idx < num_fixed for idx in compiled_graph.mutated_input_idxs
)
@@ -1076,9 +1076,9 @@ def fw_compiler_base(
# For training
# len(orig_model_outputs) <= len(model_outputs)
# During training, most of the time the model_outputs starts with
- # orignal module's outputs followed by saved activations.
+ # original module's outputs followed by saved activations.
# But this can be not true if the model have inplace updated tensors.
- # AOTAutograd will make those tensors being returned before the orignal
+ # AOTAutograd will make those tensors being returned before the original
# module's output.
# To make things safe, we'll use original_output_start_index field
# set by AOTAutograd to decide where the original module outputs start.
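To make the fw_compiler_base hunk above concrete: during training the flat forward-graph outputs contain the original module outputs plus saved activations, with in-place-updated tensors possibly moved to the front, and original_output_start_index marks where the real outputs begin. A hedged sketch of slicing on that index (names and layout here are illustrative, not the exact compile_fx code):

def split_fw_outputs(model_outputs, num_orig_outputs, original_output_start_index):
    # Tensors returned early because they were updated in place.
    mutated = model_outputs[:original_output_start_index]
    # The user-visible outputs of the original module.
    end = original_output_start_index + num_orig_outputs
    orig_outputs = model_outputs[original_output_start_index:end]
    # Everything after that is activations saved for the backward pass.
    saved_activations = model_outputs[end:]
    return mutated, orig_outputs, saved_activations

# Example: 1 in-place-updated tensor, 2 real outputs, 3 saved activations.
outs = ["mut_in", "y1", "y2", "act1", "act2", "act3"]
print(split_fw_outputs(outs, num_orig_outputs=2, original_output_start_index=1))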
4 changes: 2 additions & 2 deletions torch/_inductor/config.py
@@ -48,7 +48,7 @@
# enable pattern match+replace optimizations
pattern_matcher = True

- # register custom graph optimizatin pass hook. so far, pre/post passes are
+ # register custom graph optimization pass hook. so far, pre/post passes are
# only applied before/after pattern_matcher in post_grad_passes.
#
# def my_custom_pre_pass(graph: torch.fx.graph.Graph):
@@ -423,7 +423,7 @@ class triton:
# the max number of spills we allow for the configs we benchmark.
# Setting this to 0 means we skip a config if it spills even a single
# register.
- # Settting it to a larger value allows a config spilling a small amount
+ # Setting it to a larger value allows a config spilling a small amount
# of registers being benchmarked.
#
# NOTE: triton will always report >0 register spills for kernels using sin/cos.
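The first config.py hunk above describes a hook for custom graph passes that run around the pattern matcher in post_grad_passes, and its example is truncated mid-definition. A sketch of what such a hook could look like; the post_grad_custom_pre_pass attribute name is an assumption filled in here for illustration, since the hunk cuts off before naming it:

import torch
from torch._inductor import config as inductor_config

def my_custom_pre_pass(graph: torch.fx.graph.Graph) -> None:
    # Mutate the FX graph in place; this toy pass only counts the nodes.
    print("custom pre-pass sees", len(list(graph.nodes)), "nodes")

# Assumed hook name; the truncated comment above does not show it.
inductor_config.post_grad_custom_pre_pass = my_custom_pre_pass

compiled = torch.compile(lambda x: (x + 1).relu())
compiled(torch.randn(8))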
6 changes: 3 additions & 3 deletions torch/_inductor/coordinate_descent_tuner.py
@@ -38,10 +38,10 @@ class CoordescTuner:
"""
The coordinate descent tuner. Tune one field/coordinate at a time.
- TODO will it be necessary to tune multiple fields simultanuously.
+ TODO will it be necessary to tune multiple fields simultaneously.
- TODO: what if both increasing and descreasing a field can improve perf.
+ TODO: what if both increasing and decreasing a field can improve perf.
i.e., there are multiple local optima..
"""

@@ -224,7 +224,7 @@ def compare_config(self, func, candidate_config, best_config, best_timing):
Check if candidate_config is better than best_config.
Return a touple of (compare_result, candidate_timing).
- compare_result is true iff condidate_config is better.
+ compare_result is true iff candidate_config is better.
"""
log.debug("Try config %s", candidate_config)
try:
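A toy version of the coordinate-descent idea in the CoordescTuner docstring above: tune one field at a time and keep a candidate only when it benchmarks faster. This is a generic illustration with a fake objective, not the real tuner API:

def coordinate_descent(benchmark, config, neighbors):
    # benchmark(config) -> runtime; neighbors(field, value) -> candidate values.
    best, best_time = dict(config), benchmark(config)
    improved = True
    while improved:
        improved = False
        for field in list(best):                # one coordinate at a time
            for value in neighbors(field, best[field]):
                candidate = {**best, field: value}
                timing = benchmark(candidate)
                if timing < best_time:          # candidate is strictly better
                    best, best_time, improved = candidate, timing, True
    return best, best_time

# Fake objective whose sweet spot is XBLOCK=64, num_warps=4.
bench = lambda c: abs(c["XBLOCK"] - 64) + 2 * abs(c["num_warps"] - 4)
print(coordinate_descent(bench, {"XBLOCK": 16, "num_warps": 1},
                         lambda f, v: [max(1, v // 2), v * 2]))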
2 changes: 1 addition & 1 deletion torch/_inductor/decomposition.py
@@ -247,7 +247,7 @@ def non_empty_tensor(x):
elif 1 < len(filtered_tensors) < len(tensors):
# on the first call, when we remove empty tensors, we redispatch recursively
return aten.cat.default(filtered_tensors, dim)
- # when no 'filtering' has occured, we raise to prevent infinite recursion (no more decomposition needed)
+ # when no 'filtering' has occurred, we raise to prevent infinite recursion (no more decomposition needed)
return NotImplemented


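For the cat-decomposition hunk above, here is a stripped-down sketch of the filter-then-redispatch pattern it follows (the emptiness check is simplified; the real decomposition reasons about sizes along the concat dimension and handles more cases):

import torch

aten = torch.ops.aten

def cat_decomposition(tensors, dim=0):
    filtered = [t for t in tensors if t.numel() != 0]   # simplified emptiness check
    if len(filtered) == 1:
        return filtered[0].clone()
    if 1 < len(filtered) < len(tensors):
        # Some empty tensors were dropped: redispatch so the shorter list is
        # decomposed again.
        return aten.cat.default(filtered, dim)
    # Nothing was filtered out: returning NotImplemented stops the recursion
    # and falls back to the regular aten.cat kernel.
    return NotImplemented

x, e, y = torch.ones(2), torch.empty(0), torch.zeros(3)
print(cat_decomposition([x, e, y]))   # redispatches with just [x, y]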
2 changes: 1 addition & 1 deletion torch/_inductor/fx_passes/quantization.py
@@ -642,7 +642,7 @@ def _register_dequant_promotion_pass(pattern, pass_number):
)
def dequant_promotion(match: Match, *args, **kwargs):
# If dequant pattern used by multiply nodes,
- # we will do dequant promotion. So each user node has a seperate dequant pattern connected.
+ # we will do dequant promotion. So each user node has a separate dequant pattern connected.
def clone_to_new_node(graph, source_node, user_node):
assert (
source_node.op == "call_function"
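The dequant-promotion hunk above gives every user of a shared dequant pattern its own copy of that pattern. A generic torch.fx sketch of that clone-per-user move, with a hypothetical helper name (it is not the actual clone_to_new_node):

import operator

import torch
import torch.fx

def promote_node_per_user(graph: torch.fx.Graph, node: torch.fx.Node) -> None:
    # Give every user after the first its own clone of `node`.
    users = list(node.users)
    for user in users[1:]:
        with graph.inserting_before(user):
            clone = graph.node_copy(node)       # same op, same args
        user.replace_input_with(node, clone)    # rewire this user onto the clone

# Tiny demo: y = x * 2 feeds two adds; after promotion each add gets its own multiply.
def f(x):
    y = x * 2
    return y + 1, y + 2

gm = torch.fx.symbolic_trace(f)
mul = next(n for n in gm.graph.nodes if n.target is operator.mul)
promote_node_per_user(gm.graph, mul)
gm.recompile()
print(gm.code)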
4 changes: 2 additions & 2 deletions torch/_inductor/graph.py
@@ -346,7 +346,7 @@ def decide_layout_opt(gm) -> bool:
#
# We disable layout optimization if a model contains aten._scaled_dot_product_flash_attention.
#
- # An alternative is to do necessary layout convertion to make sure aten._scaled_dot_product_flash_attention's
+ # An alternative is to do necessary layout conversion to make sure aten._scaled_dot_product_flash_attention's
# inputs have the layout needed. But that seems to have worse perf than disabing the layout opt.
# TODO(shunting) revisit if we can still apply layout optimization to models containing sdpa while
# bringing perf gains.
@@ -909,7 +909,7 @@ def init_wrapper_code(self):

device_types = self.device_types.copy()
# In terms of some operations that don't have input tensors, we need to
- # check the deivce of the buffers.
+ # check the device of the buffers.
for buffer in self.buffers:
device_types.add(buffer.get_device().type)
device_types.discard("cpu")
4 changes: 2 additions & 2 deletions torch/_inductor/ir.py
@@ -5918,7 +5918,7 @@ class InPlaceHint(ExternKernel):
Wrap the input of your inplace op to enable this behavior.
The design is based on two key decisions:
- - this node is resposible for allocating the in/out buffer used by the collective.
+ - this node is responsible for allocating the in/out buffer used by the collective.
This is controlled by the ``should_allocate`` method that returns True here and
False for the collective node
- The scheduler special-case this node and enable it to reuse its input.
@@ -5958,7 +5958,7 @@ def codegen(self, wrapper):
class MultiOutputNoSizeAssert(MultiOutput):
"""
Extract partial output from a multi-output OP.
- Works like MultiOutput but doesn't assert size. This must be a property guaranteed by the op emiting this.
+ Works like MultiOutput but doesn't assert size. This must be a property guaranteed by the op emitting this.
"""

def __init__(self, layout, input, index):
4 changes: 2 additions & 2 deletions torch/_inductor/lowering.py
@@ -1553,7 +1553,7 @@ def make_fallback(kernel, layout_constraint=None, warn=True):
if torch._dynamo.config.suppress_errors:
torch._dynamo.config.suppress_errors = False
log.warning(
- "A make_fallback error occured in suppress_errors config,"
+ "A make_fallback error occurred in suppress_errors config,"
" and suppress_errors is being disabled to surface it."
)
raise AssertionError(
@@ -1584,7 +1584,7 @@ def philox_rand_offset(shape):
@register_lowering(torch.ops.rngprims.philox_rand, type_promotion_kind=None)
def philox_rand(size, seed, offset, stride, device, dtype):
# stride arg is optional and will be used in future for distributed random
- # ops. Currently, its ununsed.
+ # ops. Currently, its unused.
random_pos = ir.FixedLayout(
device,
dtype,
4 changes: 2 additions & 2 deletions torch/_inductor/pattern_matcher.py
@@ -531,7 +531,7 @@ def _match(self, node: List[torch.fx.Node], ctx: MatchContext):
if not isinstance(node, (list, tuple)) or len(node) == 0:
return FailedMatch("non_list")
m = Match(self)
- # Propogating patterns with multiple users will ensure we don't revisit
+ # Propagating patterns with multiple users will ensure we don't revisit
# the same nodes
pattern_to_node = ctx.filter_multi_user_patterns()
matched = False
@@ -851,7 +851,7 @@ def register_replacement(
"""
Create a replacement rule based on example functions that get traced
to create patterns. This supports both training and inference when
- run on a joint foward+backward graph.
+ run on a joint forward+backward graph.
Args:
search_fn: traced to give original pattern
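The register_replacement docstring above says patterns come from example functions that get traced. A hedged illustration of what such a search/replace pair might look like; the registration call itself and its remaining arguments are not shown in this hunk, so they are left out here:

import torch

def search_fn(x: torch.Tensor, w: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
    # Pattern to look for: a matmul followed by a separate bias add.
    return torch.mm(x, w) + b

def replace_fn(x: torch.Tensor, w: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
    # Equivalent replacement expressed as a single fused op.
    return torch.addmm(b, x, w)

# Sanity check that the two are numerically interchangeable.
x, w, b = torch.randn(4, 3), torch.randn(3, 5), torch.randn(5)
assert torch.allclose(search_fn(x, w, b), replace_fn(x, w, b), atol=1e-5)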
2 changes: 1 addition & 1 deletion torch/_inductor/scheduler.py
@@ -1509,7 +1509,7 @@ def can_fusion_increase_peak_memory(
The current attempt is a quick, possibly hacky, heuristic to prevent the
fusion of nodes that are far away in the original order.
- A better but difficult to implement heursitic would be to use live
+ A better but difficult to implement heurisitic would be to use live
intervals of the buffers, find region of peak pressure in the original
program and prevent fusion that crosses that peak region. We might need
special care or good approximation in this implementation, as fusion of
2 changes: 1 addition & 1 deletion torch/_inductor/sizevars.py
@@ -231,7 +231,7 @@ def prune(index):
# Note - [On Statically Known]
#
# The statically_known_* family of functions below replaces a prior system, called maybe_guard_*. The prior system
- # operated by providing esentially a question, where the size hinted values were evaluted. If the condition was
+ # operated by providing essentially a question, where the size hinted values were evaluated. If the condition was
# true, we add a guard and return True, otherwise, False.
#
# def maybe_guard_foo(args):
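The note in the sizevars.py hunk above contrasts the old maybe_guard_* helpers (answer using the hinted sizes and install a guard as a side effect) with the newer statically_known_* ones. A toy, self-contained sketch of that behavioural difference; the data structures here are stand-ins, not the real ShapeEnv:

guards = []
hints = {"s0": 8}                     # example value recorded for symbol s0

def maybe_guard_lt(sym, bound):
    # Old style: decide using the hinted value and, when it holds, bake the
    # assumption into the compiled artifact as a guard.
    if hints[sym] < bound:
        guards.append(f"{sym} < {bound}")
        return True
    return False

def statically_known_lt(sym, bound, known_upper_bounds):
    # New style: only answer True when it already follows from known facts;
    # never add a guard as a side effect.
    return known_upper_bounds.get(sym, float("inf")) < bound

print(maybe_guard_lt("s0", 16), guards)            # True ['s0 < 16']
print(statically_known_lt("s0", 16, {"s0": 32}))   # False, and no new guard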
2 changes: 1 addition & 1 deletion torch/_inductor/triton_heuristics.py
@@ -1064,7 +1064,7 @@ def reduction(size_hints, reduction_hint=False, meta=None, filename=None):
triton_config_reduction(size_hints, 64, 64),
triton_config_reduction(size_hints, 8, 512),
# halve the XBLOCK/RBLOCK compared to outer_config
- # TODO: this may only be beneficial when each iteration of the reduciton
+ # TODO: this may only be beneficial when each iteration of the reduction
# is quite heavy. E.g. https://gist.github.com/shunting314/189a8ef69f90db9d614a823385147a72
triton_config_reduction(size_hints, 64, 4, num_warps=8),
],
2 changes: 1 addition & 1 deletion torch/_inductor/utils.py
@@ -840,7 +840,7 @@ def run_and_get_triton_code(fn, *args, **kwargs):
@contextlib.contextmanager
def override_lowering(aten_op, override_fn):
"""
- Override the lowering of aten_op with overide_fn.
+ Override the lowering of aten_op with override_fn.
The first argument of override_fn is the original lowering fn.
"""
from torch._inductor import lowering
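The override_lowering docstring above says the override receives the original lowering as its first argument. A hedged usage sketch in which the wrapped lowering just logs and delegates (treat the details as illustrative, not a documented recipe):

import torch
from torch._inductor.utils import override_lowering

def logged_add(orig_lowering, *args, **kwargs):
    # First positional argument is the original lowering fn, per the docstring.
    print("lowering aten.add.Tensor")
    return orig_lowering(*args, **kwargs)

with override_lowering(torch.ops.aten.add.Tensor, logged_add):
    compiled = torch.compile(lambda x: x + 1)
    compiled(torch.randn(4))   # compilation (and the log line) happens here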
2 changes: 1 addition & 1 deletion torch/_inductor/wrapper_benchmark.py
@@ -30,7 +30,7 @@ def get_kernel_category_by_source_code(src_code):
def get_kernel_category(kernel_mod):
"""
Given the module defining a triton kernel, return the category of the kernel.
- Cateogry can be one of:
+ Category can be one of:
- pointwise
- reduction
- persistent_reduction
