doc(dfg): a lot of additional documentation for dfg info

flowr-analysis · Dec 5, 2024 · d1aa9f3 · d1aa9f3
1 parent 4a75197
commit d1aa9f3
Show file tree

Hide file tree

Showing 6 changed files with 185 additions and 29 deletions.
diff --git a/src/dataflow/environments/identifier.ts b/src/dataflow/environments/identifier.ts
@@ -6,8 +6,16 @@ export type Identifier = string & { __brand?: 'identifier' }
 
 /**
  * Each reference only has exactly one reference type, stored as the respective number.
- * However, when checking we may want to allow for one of several types,
+ * However, when checking, we may want to allow for one of several types,
  * allowing the combination of the respective bitmasks.
+ *
+ * Having reference types is important as R separates a variable definition from
+ * a function definition when resolving an {@link Identifier|identifier}.
+ * In `c <- 3; print(c(1, 2))` the call to `c` works normally (as the vector constructor),
+ * while writing `c <- function(...) ..1` shadows the built-in and causes `print` to only output the first element.
+ *
+ * @see {@link isReferenceType} - for checking if a (potentially joint) reference type contains a certain type
+ * @see {@link ReferenceTypeReverseMapping} - for debugging
  */
 export enum ReferenceType {
 	/** The identifier type is unknown */
@@ -28,6 +36,7 @@ export enum ReferenceType {
 	BuiltInFunction = 128
 }
 
+/** Reverse mapping of the reference types so you can get the name from the bitmask (useful for debugging) */
 export const ReferenceTypeReverseMapping = new Map<ReferenceType, string>(Object.entries(ReferenceType).map(([k, v]) => [v as ReferenceType, k]));
 
 /**
@@ -40,12 +49,24 @@ export function isReferenceType(t: ReferenceType, target: ReferenceType): boolea
 export type InGraphReferenceType = Exclude<ReferenceType, ReferenceType.BuiltInConstant | ReferenceType.BuiltInFunction>
 
 /**
- * Something like `a` in `b <- a`.
- * Without any surrounding information, `a` will produce the identifier reference `a`.
- * Similarly, `b` will create a reference.
+ * An identifier reference points to a variable like `a` in `b <- a`.
+ * Without any surrounding code, `a` will produce the identifier reference `a`.
+ * Similarly, `b` will create a reference (although it will be an {@link IdentifierDefinition|identifier definition}
+ * which adds even more information).
+ *
+ * In general,
+ * references are merely pointers (with meta-information) to a vertex in the {@link DataflowGraph|dataflow graph}.
+ * In the context of the extractor, for example,
+ * they indicate the references that are currently (during the analysis at this given node)
+ * {@link DataflowInformation#in|read (`in`)}, {@link DataflowInformation#out|written (`out`)},
+ * or {@link DataflowInformation#unknownReferences|unknown (`unknownReferences`)}.
+ *
+ * @see {@link InGraphIdentifierDefinition}
  */
 export interface IdentifierReference {
-	/** Node which represents the reference in the AST */
+	/**
+	 * The id of the node which represents the reference in the {@link NormalizedAst|normalized AST} and the {@link DataflowGraph|dataflow graph}.
+	 */
 	readonly nodeId:     NodeId
 	/** Name the reference is identified by (e.g., the name of the variable), undefined if the reference is "artificial" (e.g., anonymous) */
 	readonly name:       Identifier | undefined
@@ -58,14 +79,28 @@ export interface IdentifierReference {
 	controlDependencies: ControlDependency[] | undefined
 }
 
-
+/**
+ * The definition of an {@link Identifier|identifier} within the {@link DataflowGraph|graph}.
+ * This extends the {@link IdentifierReference}
+ * by adding the {@link NodeId} of the definition
+ * (and using `type` to mark the object type).
+ *
+ * Within a code snippet like `a <- 3`, the symbol processor will first create an
+ * {@link IdentifierReference|identifier reference} for `a` to reference the use
+ * and then promote it to an {@link InGraphIdentifierDefinition|identifier definition}.
+ *
+ * @see {@link IdentifierReference}
+ */
 interface InGraphIdentifierDefinition extends IdentifierReference {
 	readonly type:      InGraphReferenceType
 	/** The assignment (or whatever, like `assign` function call) node which ultimately defined this identifier */
 	readonly definedAt: NodeId
 }
 
 /**
- * Stores the definition of an identifier within an {@link IEnvironment}
+ * Stores the definition of an identifier within an {@link IEnvironment}.
+ *
+ * {@link BuiltInIdentifierDefinition} and {@link BuiltInIdentifierConstant} are used for built-in functions and constants only,
+ * so the most important one for your day-to-day R script is the {@link InGraphIdentifierDefinition}.
  */
 export type IdentifierDefinition = InGraphIdentifierDefinition | BuiltInIdentifierDefinition | BuiltInIdentifierConstant
diff --git a/src/dataflow/graph/graph.ts b/src/dataflow/graph/graph.ts
@@ -112,6 +112,12 @@ export interface DataflowGraphJson {
 	readonly edgeInformation:   [NodeId, [NodeId, DataflowGraphEdge][]][]
 }
 
+/**
+ * An unknown side effect describes something that we cannot handle correctly (in all cases).
+ * For example, `eval` will be marked as an unknown side effect as we have no idea of how it will affect the program.
+ * Linked side effects are used whenever we know that a call may be affected by another one in a way that we cannot
+ * grasp from the dataflow perspective (e.g., an indirect dependency based on the currently active graphic device).
+ */
 export type UnknownSidEffect = NodeId | { id: NodeId, linkTo: LinkTo<RegExp> }
 
 /**
@@ -124,7 +130,11 @@ export type UnknownSidEffect = NodeId | { id: NodeId, linkTo: LinkTo<RegExp> }
  * However, this does not have to hold during the construction as edges may point from or to vertices which are yet to be constructed.
  *
  * All methods return the modified graph to allow for chaining.
- * You can use {@link DataflowGraph#fromJson} to construct a dataflow graph object from a deserialized JSON object.
+ *
+ * @see {@link DataflowGraph#addEdge|`addEdge`} - to add an edge to the graph
+ * @see {@link DataflowGraph#addVertex|`addVertex`} - to add a vertex to the graph
+ * @see {@link DataflowGraph#fromJson|`fromJson`} - to construct a dataflow graph object from a deserialized JSON object.
+ * @see {@link emptyGraph} - to create an empty graph (useful in tests)
  */
 export class DataflowGraph<
 	Vertex extends DataflowGraphVertexInfo = DataflowGraphVertexInfo,

diff --git a/src/dataflow/graph/vertex.ts b/src/dataflow/graph/vertex.ts
@@ -14,7 +14,7 @@ export enum VertexType {
 }
 
 /**
- * Arguments required to construct a vertex in the dataflow graph.
+ * Arguments required to construct a vertex in the {@link DataflowGraph|dataflow graph}.
  *
  * @see DataflowGraphVertexUse
  * @see DataflowGraphVertexVariableDefinition
@@ -26,15 +26,17 @@ interface DataflowGraphVertexBase extends MergeableRecord {
 	 */
 	readonly tag:        VertexType
 	/**
-	 * The id of the node (the id assigned by the {@link ParentInformation} decoration)
+	 * The id of the node (the id assigned by the {@link ParentInformation} decoration).
+	 * This uniquely identifies the vertex in the {@link DataflowGraph|dataflow graph}
+	 * as well as the corresponding {@link NormalizedAst|normalized AST}.
 	 */
 	id:                  NodeId
 	/**
 	 * The environment in which the vertex is set.
 	 */
 	environment?:        REnvironmentInformation | undefined
 	/**
-	 * See {@link IdentifierReference}
+	 * @see {@link ControlDependency} - the collection of control dependencies which have an influence on whether the vertex is executed.
 	 */
 	controlDependencies: ControlDependency[] | undefined
 }
@@ -56,15 +58,20 @@ interface DataflowGraphVertexBase extends MergeableRecord {
  * This then returns the corresponding node in the {@link NormalizedAst|normalized AST}, for example,
  * an {@link RNumber} or {@link RString}.
  *
- * This works similarly for {@link IdentifierReference|reference} for which you can use the `id`.
+ * This works similarly for {@link IdentifierReference|identifier references}
+ * for which you can use the {@link IdentifierReference#nodeId|`nodeId`}.
+ *
+ * @see {@link isValueVertex} - to check if a vertex is a value vertex
  */
 export interface DataflowGraphVertexValue extends DataflowGraphVertexBase {
 	readonly tag:          VertexType.Value
 	readonly environment?: undefined
 }
 
 /**
- * Arguments required to construct a vertex which represents the usage of a variable in the dataflow graph.
+ * Arguments required to construct a vertex which represents the usage of a variable in the {@link DataflowGraph|dataflow graph}.
+ *
+ * @see {@link isUseVertex} - to check if a vertex is a use vertex
  */
 export interface DataflowGraphVertexUse extends DataflowGraphVertexBase {
 	readonly tag:          VertexType.Use
@@ -73,7 +80,9 @@ export interface DataflowGraphVertexUse extends DataflowGraphVertexBase {
 }
 
 /**
- * Arguments required to construct a vertex which represents the usage of a variable in the dataflow graph.
+ * Arguments required to construct a vertex which represents a function call in the {@link DataflowGraph|dataflow graph}.
+ *
+ * @see {@link isFunctionCallVertex} - to check if a vertex is a function call vertex
  */
 export interface DataflowGraphVertexFunctionCall extends DataflowGraphVertexBase {
 	readonly tag:  VertexType.FunctionCall
@@ -93,14 +102,21 @@ export interface DataflowGraphVertexFunctionCall extends DataflowGraphVertexBase
 }
 
 /**
- * Arguments required to construct a vertex which represents the definition of a variable in the dataflow graph.
+ * Arguments required to construct a vertex which represents the definition of a variable in the {@link DataflowGraph|dataflow graph}.
+ *
+ * @see {@link isVariableDefinitionVertex} - to check if a vertex is a variable definition vertex
  */
 export interface DataflowGraphVertexVariableDefinition extends DataflowGraphVertexBase {
 	readonly tag:          VertexType.VariableDefinition
 	/** Does not require an environment, those are attached to the call */
 	readonly environment?: undefined
 }
 
+/**
+ * Arguments required to construct a vertex which represents the definition of a function in the {@link DataflowGraph|dataflow graph}.
+ *
+ * @see {@link isFunctionDefinitionVertex} - to check if a vertex is a function definition vertex
+ */
 export interface DataflowGraphVertexFunctionDefinition extends DataflowGraphVertexBase {
 	readonly tag: VertexType.FunctionDefinition
 	/**
@@ -130,25 +146,43 @@ export type DataflowGraphVertexArgument = DataflowGraphVertexUse | DataflowGraph
  */
 export type DataflowGraphVertexInfo = Required<DataflowGraphVertexArgument>
 
+/**
+ * A mapping of {@link NodeId}s to {@link DataflowGraphVertexInfo|vertices}.
+ */
 export type DataflowGraphVertices<Vertex extends DataflowGraphVertexInfo = DataflowGraphVertexInfo> = Map<NodeId, Vertex>
 
 
+/**
+ * Check if the given vertex is a {@link DataflowGraphVertexValue|value vertex}.
+ */
 export function isValueVertex(vertex: DataflowGraphVertexBase): vertex is DataflowGraphVertexValue {
 	return vertex.tag === VertexType.Value;
 }
 
+/**
+ * Check if the given vertex is a {@link DataflowGraphVertexUse|use vertex}.
+ */
 export function isUseVertex(vertex: DataflowGraphVertexBase): vertex is DataflowGraphVertexUse {
 	return vertex.tag === VertexType.Use;
 }
 
+/**
+ * Check if the given vertex is a {@link DataflowGraphVertexFunctionCall|function call vertex}.
+ */
 export function isFunctionCallVertex(vertex: DataflowGraphVertexBase): vertex is DataflowGraphVertexFunctionCall {
 	return vertex.tag === VertexType.FunctionCall;
 }
 
+/**
+ * Check if the given vertex is a {@link DataflowGraphVertexVariableDefinition|variable definition vertex}.
+ */
 export function isVariableDefinitionVertex(vertex: DataflowGraphVertexBase): vertex is DataflowGraphVertexVariableDefinition {
 	return vertex.tag === VertexType.VariableDefinition;
 }
 
+/**
+ * Check if the given vertex is a {@link DataflowGraphVertexFunctionDefinition|function definition vertex}.
+ */
 export function isFunctionDefinitionVertex(vertex: DataflowGraphVertexBase): vertex is DataflowGraphVertexFunctionDefinition {
 	return vertex.tag === VertexType.FunctionDefinition;
 }

diff --git a/src/dataflow/info.ts b/src/dataflow/info.ts
@@ -5,6 +5,27 @@ import type { REnvironmentInformation } from './environments/environment';
 import { DataflowGraph } from './graph/graph';
 import type { GenericDifferenceInformation, WriteableDifferenceReport } from '../util/diff';
 
+
+/**
+ * A control dependency links a vertex to the control flow element which
+ * may have an influence on its execution.
+ * Within `if(p) a else b`, `a` and `b` have a control dependency on the `if` (which in turn decides based on `p`).
+ *
+ * @see {@link happensInEveryBranch} - to check whether a list of control dependencies is exhaustive
+ */
+export interface ControlDependency {
+	/** The id of the node that causes the control dependency to be active (e.g., the condition of an if) */
+	readonly id:    NodeId,
+	/** when does this control dependency trigger (if the condition is true or false)? */
+	readonly when?: boolean
+}
+
+
+/**
+ * Classifies the type of exit point encountered.
+ *
+ * @see {@link ExitPoint}
+ */
 export const enum ExitPointType {
 	/** The exit point is the implicit (last executed expression of a function/block) */
 	Default = 0,
@@ -16,23 +37,32 @@ export const enum ExitPointType {
 	Next = 3
 }
 
-export interface ControlDependency {
-	/** The id of the node that causes the control dependency to be active (e.g., the condition of an if) */
-	readonly id:    NodeId,
-	/** when does this control dependency trigger (if the condition is true or false)? */
-	readonly when?: boolean
-}
-
-
+/**
+ * An exit point describes the position which ends the current control flow structure.
+ * This may be as innocent as the last expression or explicit with a `return`/`break`/`next`.
+ *
+ * @see {@link ExitPointType} - for the different types of exit points
+ * @see {@link addNonDefaultExitPoints} - to easily modify lists of exit points
+ * @see {@link alwaysExits} - to check whether the given dataflow information always triggers an exit
+ * @see {@link filterOutLoopExitPoints} - to remove loop exit points from a list
+ */
 export interface ExitPoint {
 	/** What kind of exit point is this one? May be used to filter for exit points of specific causes. */
 	readonly type:                ExitPointType,
 	/** The id of the node which causes the exit point! */
 	readonly nodeId:              NodeId,
-	/** Control dependencies which influence if the exit point triggers (e.g., if the `return` is contained within an `if` statement) */
+	/**
+	 * Control dependencies which influence if the exit point triggers
+	 * (e.g., if the `return` is contained within an `if` statement).
+	 *
+	 * @see {@link happensInEveryBranch} - to check whether control dependencies are exhaustive
+	 */
 	readonly controlDependencies: ControlDependency[] | undefined
 }
 
+/**
+ * Adds all non-default exit points to the existing list.
+ */
 export function addNonDefaultExitPoints(existing: ExitPoint[], add: readonly ExitPoint[]): void {
 	existing.push(...add.filter(({ type }) => type !== ExitPointType.Default));
 }
@@ -46,22 +76,50 @@ export interface DataflowCfgInformation {
 }
 
 /**
- * The dataflow information is continuously updated during the dataflow analysis
+ * The dataflow information is one of the fundamental structures we have in the dataflow analysis.
+ * It is continuously updated during the dataflow analysis
  * and holds its current state for the respective subtree processed.
+ * Each processor during the dataflow analysis may use the information from its children
+ * to produce a new state of the dataflow information.
+ *
+ * You may initialize a new dataflow information with {@link initializeCleanDataflowInformation}.
+ *
+ * @see {@link DataflowCfgInformation} - the control flow aspects
  */
 export interface DataflowInformation extends DataflowCfgInformation {
-	/** References that have not been identified as read or write and will be so on higher */
+	/**
+	 * References that have not yet been identified as read or write; higher-level processors will classify them.
+	 *
+	 * For example, when we analyze the `x` vertex in `x <- 3`, we will first create an unknown reference for `x`
+	 * as we have not yet seen the assignment!
+	 *
+	 * @see {@link IdentifierReference} - a reference to a variable, parameter, function call, ...
+	 */
 	unknownReferences: readonly IdentifierReference[]
-	/** References which are read */
+	/**
+	 * References which are read within the current subtree.
+	 *
+	 * @see {@link IdentifierReference} - a reference to a variable, parameter, function call, ...
+	 */
 	in:                readonly IdentifierReference[]
-	/** References which are written to */
+	/**
+	 * References which are written to within the current subtree.
+	 *
+	 * @see {@link IdentifierReference} - a reference to a variable, parameter, function call, ...
+	 */
 	out:               readonly IdentifierReference[]
 	/** Current environments used for name resolution, probably updated on the next expression-list processing */
 	environment:       REnvironmentInformation
 	/** The current constructed dataflow graph */
 	graph:             DataflowGraph
 }
 
+/**
+ * Initializes an empty {@link DataflowInformation} object with the given entry point and data.
+ * This is to be used as a "starting point" when processing leaf nodes during the dataflow extraction.
+ *
+ * @see {@link DataflowInformation}
+ */
 export function initializeCleanDataflowInformation<T>(entryPoint: NodeId, data: Pick<DataflowProcessorInformation<T>, 'environment' | 'completeAst'>): DataflowInformation {
 	return {
 		unknownReferences: [],
@@ -74,6 +132,10 @@ export function initializeCleanDataflowInformation<T>(entryPoint: NodeId, data:
 	};
 }
 
+/**
+ * Checks whether the given control dependencies are exhaustive (i.e., if for every control dependency on a boolean,
+ * the list contains a dependency on the `true` and on the `false` case).
+ */
 export function happensInEveryBranch(controlDependencies: readonly ControlDependency[] | undefined): boolean {
 	if(controlDependencies === undefined) {
 		/* the cds are unconstrained */
@@ -97,12 +159,19 @@ export function happensInEveryBranch(controlDependencies: readonly ControlDepend
 	return trues.every(id => falseSet.has(id));
 }
 
+/**
+ * Checks whether the given dataflow information always exits (i.e., if there is a non-default exit point in every branch).
+ * @see {@link ExitPoint} - for the different types of exit points
+ */
 export function alwaysExits(data: DataflowInformation): boolean {
 	return data.exitPoints?.some(
 		e => e.type !== ExitPointType.Default && happensInEveryBranch(e.controlDependencies)
 	) ?? false;
 }
 
+/**
+ * Filters out exit points which end their cascade within a loop, keeping only the `Return` and `Default` exit points.
+ */
 export function filterOutLoopExitPoints(exitPoints: readonly ExitPoint[]): readonly ExitPoint[] {
 	return exitPoints.filter(({ type }) => type === ExitPointType.Return || type === ExitPointType.Default);
 }