From 4a156d4e43d328e20a1db2e51b0efcf305e61137 Mon Sep 17 00:00:00 2001
From: Myles Scolnick <myles@marimo.io>
Date: Thu, 12 Dec 2024 13:28:45 -0500
Subject: [PATCH] experimental: smarted detection of edited code

---
 frontend/package.json                         |   2 +
 frontend/pnpm-lock.yaml                       |  21 +-
 frontend/src/core/cells/cells.ts              |  10 +-
 .../core/codemirror/__tests__/edited.test.ts  | 259 ++++++++++++++++++
 frontend/src/core/codemirror/edited.ts        | 107 ++++++++
 5 files changed, 391 insertions(+), 8 deletions(-)
 create mode 100644 frontend/src/core/codemirror/__tests__/edited.test.ts
 create mode 100644 frontend/src/core/codemirror/edited.ts

diff --git a/frontend/package.json b/frontend/package.json
index c82a9bf6386..0b346dd1456 100644
--- a/frontend/package.json
+++ b/frontend/package.json
@@ -37,6 +37,7 @@
     "@lezer/common": "^1.2.1",
     "@lezer/highlight": "^1.2.1",
     "@lezer/lr": "^1.4.2",
+    "@lezer/python": "^1.1.15",
     "@marimo-team/marimo-api": "file:../openapi",
     "@marimo-team/react-slotz": "^0.1.8",
     "@open-rpc/client-js": "^1.8.1",
@@ -107,6 +108,7 @@
     "js-cookie": "^3.0.5",
     "katex": "^0.16.11",
     "lodash-es": "^4.17.21",
+    "lru-cache": "^11.0.2",
     "lucide-react": "^0.428.0",
     "lz-string": "^1.5.0",
     "mermaid": "^11.3.0",
diff --git a/frontend/pnpm-lock.yaml b/frontend/pnpm-lock.yaml
index 385352284f8..d07452a465d 100644
--- a/frontend/pnpm-lock.yaml
+++ b/frontend/pnpm-lock.yaml
@@ -87,6 +87,9 @@ importers:
       '@lezer/lr':
         specifier: ^1.4.2
         version: 1.4.2
+      '@lezer/python':
+        specifier: ^1.1.15
+        version: 1.1.15
       '@marimo-team/marimo-api':
         specifier: file:../openapi
         version: file:../openapi
@@ -297,6 +300,9 @@ importers:
       lodash-es:
         specifier: ^4.17.21
         version: 4.17.21
+      lru-cache:
+        specifier: ^11.0.2
+        version: 11.0.2
       lucide-react:
         specifier: ^0.428.0
         version: 0.428.0(react@18.3.1)
@@ -1469,8 +1475,8 @@ packages:
   '@lezer/php@1.0.1':
     resolution: {integrity: sha512-aqdCQJOXJ66De22vzdwnuC502hIaG9EnPK2rSi+ebXyUd+j7GAX1mRjWZOVOmf3GST1YUfUCu6WXDiEgDGOVwA==}
 
-  '@lezer/python@1.1.7':
-    resolution: {integrity: sha512-RbhKQ9+Y/r/Xv6OcJmETEM5tBFdpdAJRqrgi3akJkWBLCuiAaLP/jKdYzu+ICljaSXPCQeznrv+r9HUEnjq3HQ==}
+  '@lezer/python@1.1.15':
+    resolution: {integrity: sha512-aVQ43m2zk4FZYedCqL0KHPEUsqZOrmAvRhkhHlVPnDD1HODDyyQv5BRIuod4DadkgBEZd53vQOtXTonNbEgjrQ==}
 
   '@lezer/rust@1.0.1':
     resolution: {integrity: sha512-j+ToFKM6Wpglv3OQ4ebHYdYIMT2dh0ziCCV0rTf47AWiHOVhR0WjaKrBq+yuvDQNEhr5sxPxVI7+naJIgpqcsQ==}
@@ -6289,6 +6295,10 @@ packages:
   loupe@3.1.2:
     resolution: {integrity: sha512-23I4pFZHmAemUnz8WZXbYRSKYj801VDaNv9ETuMh7IrMc7VuVVSo+Z9iLE3ni30+U48iDWfi30d3twAXBYmnCg==}
 
+  lru-cache@11.0.2:
+    resolution: {integrity: sha512-123qHRfJBmo2jXDbo/a5YOQrJoHF/GNQTLzQ5+IdK5pWpceK17yRc6ozlWd25FxvGKQbIUs91fDFkXmDHTKcyA==}
+    engines: {node: 20 || >=22}
+
   lru-cache@5.1.1:
     resolution: {integrity: sha512-KpNARQA3Iwv+jTA0utUVVbrh+Jlrr1Fv0e56GGzAFOXN7dk/FviaDW8LHmK52DlcH4WP2n6gI8vN1aesBFgo9w==}
 
@@ -9458,7 +9468,7 @@ snapshots:
       '@codemirror/language': 6.10.3
       '@codemirror/state': 6.4.1
       '@lezer/common': 1.2.1
-      '@lezer/python': 1.1.7
+      '@lezer/python': 1.1.15
     transitivePeerDependencies:
       - '@codemirror/view'
 
@@ -10061,8 +10071,9 @@ snapshots:
       '@lezer/highlight': 1.2.1
       '@lezer/lr': 1.4.2
 
-  '@lezer/python@1.1.7':
+  '@lezer/python@1.1.15':
     dependencies:
+      '@lezer/common': 1.2.1
       '@lezer/highlight': 1.2.1
       '@lezer/lr': 1.4.2
 
@@ -15958,6 +15969,8 @@ snapshots:
 
   loupe@3.1.2: {}
 
+  lru-cache@11.0.2: {}
+
   lru-cache@5.1.1:
     dependencies:
       yallist: 3.1.1
diff --git a/frontend/src/core/cells/cells.ts b/frontend/src/core/cells/cells.ts
index 15a2a1587b6..b9e90e396d1 100644
--- a/frontend/src/core/cells/cells.ts
+++ b/frontend/src/core/cells/cells.ts
@@ -51,6 +51,7 @@ import {
 } from "@/utils/id-tree";
 import { isEqual } from "lodash-es";
 import { isErrorMime } from "../mime";
+import { areLogicallyDifferent } from "../codemirror/edited";
 
 export const SCRATCH_CELL_ID = "__scratch__" as CellId;
 
@@ -246,7 +247,7 @@ const {
           code,
           lastCodeRun,
           lastExecutionTime,
-          edited: Boolean(code) && code !== lastCodeRun,
+          edited: Boolean(code) && areLogicallyDifferent(code, lastCodeRun),
         }),
       },
       cellRuntime: {
@@ -640,7 +641,7 @@ const {
         : {
             ...cell,
             code: code,
-            edited: code.trim() !== cell.lastCodeRun,
+            edited: areLogicallyDifferent(code, cell.lastCodeRun),
           };
     });
   },
@@ -987,7 +988,7 @@ const {
           code: beforeCursorCode,
           edited:
             Boolean(beforeCursorCode) &&
-            beforeCursorCode.trim() !== cell.lastCodeRun?.trim(),
+            areLogicallyDifferent(beforeCursorCode, cell.lastCodeRun),
         },
         [newCellId]: createCell({
           id: newCellId,
@@ -1035,7 +1036,8 @@ const {
           ...cell,
           code: snapshot,
           edited:
-            Boolean(snapshot) && snapshot?.trim() !== cell.lastCodeRun?.trim(),
+            Boolean(snapshot) &&
+            areLogicallyDifferent(snapshot, cell.lastCodeRun),
         },
       },
       cellRuntime: {
diff --git a/frontend/src/core/codemirror/__tests__/edited.test.ts b/frontend/src/core/codemirror/__tests__/edited.test.ts
new file mode 100644
index 00000000000..cf081b1efb3
--- /dev/null
+++ b/frontend/src/core/codemirror/__tests__/edited.test.ts
@@ -0,0 +1,259 @@
+/* Copyright 2024 Marimo. All rights reserved. */
+import { describe, expect, test } from "vitest";
+import { areLogicallyDifferent } from "../edited";
+
+describe("areLogicallyDifferent - same code", () => {
+  test("identical code returns false", () => {
+    const code = "def foo(): return 42";
+    expect(areLogicallyDifferent(code, code)).toBe(false);
+  });
+
+  test("different whitespace returns false", () => {
+    const code1 = "def foo():\n\n\n    return 42";
+    const code2 = "def foo(): return 42";
+    expect(areLogicallyDifferent(code1, code2)).toBe(false);
+  });
+
+  test("different comments return false", () => {
+    const code1 = "def foo(): # first comment\n    return 42";
+    const code2 = "def foo(): # different comment\n    return 42";
+    expect(areLogicallyDifferent(code1, code2)).toBe(false);
+  });
+
+  test("new lines before and after return statements are ignored", () => {
+    const code1 = "def foo(): return 42\n";
+    const code2 = "def foo(): return 42\n\n";
+    const code3 = "\n\ndef foo(): return 42";
+    expect(areLogicallyDifferent(code1, code2)).toBe(false);
+    expect(areLogicallyDifferent(code1, code3)).toBe(false);
+    expect(areLogicallyDifferent(code2, code3)).toBe(false);
+  });
+
+  test("handles unicode characters", () => {
+    const code1 = 'print("Hello 世界")';
+    const code2 = 'print("Hello 世界")';
+    expect(areLogicallyDifferent(code1, code2)).toBe(false);
+  });
+
+  test("complex class definitions", () => {
+    const code1 = `
+      class MyClass:
+          def __init__(self):
+              self.x = 42
+
+          def method(self):
+              return self.x
+    `;
+    const code2 = `
+      class MyClass:
+          def __init__(self):  # Initialize
+              self.x = 42      # Set x
+
+          def method(self):    # Get x
+              return self.x    # Return it
+    `;
+    expect(areLogicallyDifferent(code1, code2)).toBe(false);
+  });
+
+  test("function with types", () => {
+    expect(
+      areLogicallyDifferent(
+        "def foo(x: int) -> int: return x",
+        "def foo(x: int) -> int: return x",
+      ),
+    ).toBe(false);
+  });
+});
+
+describe("areLogicallyDifferent - different code", () => {
+  test("multiline comments return true", () => {
+    const code1 = `"""
+      This is a long docstring
+      with multiple lines
+    """
+    def foo(): return 42`;
+    const code2 = `"""Different docstring"""
+    def foo(): return 42`;
+    expect(areLogicallyDifferent(code1, code2)).toBe(true);
+  });
+
+  test("different function names return true", () => {
+    const code1 = "def foo(): return 42";
+    const code2 = "def bar(): return 42";
+    expect(areLogicallyDifferent(code1, code2)).toBe(true);
+  });
+
+  test("function with parsing error", () => {
+    const code1 = "def foo(): return 42";
+    const code2 = "def foo():\nreturn 42";
+    expect(areLogicallyDifferent(code1, code2)).toBe(true);
+  });
+
+  test("different string literals return true", () => {
+    const code1 = 'print("hello")';
+    const code2 = 'print("world")';
+    expect(areLogicallyDifferent(code1, code2)).toBe(true);
+  });
+
+  test("different number literals return true", () => {
+    const code1 = "x = 42";
+    const code2 = "x = 43";
+    expect(areLogicallyDifferent(code1, code2)).toBe(true);
+  });
+
+  test("different operators return true", () => {
+    const code1 = "x + y";
+    const code2 = "x - y";
+    expect(areLogicallyDifferent(code1, code2)).toBe(true);
+  });
+
+  test("different control flow returns true", () => {
+    const code1 = "if x: pass";
+    const code2 = "while x: pass";
+    expect(areLogicallyDifferent(code1, code2)).toBe(true);
+  });
+
+  test("different indentation structure returns true", () => {
+    const code1 = "if x:\n    if y: pass";
+    const code2 = "if x:\n    pass\nif y: pass";
+    expect(areLogicallyDifferent(code1, code2)).toBe(true);
+  });
+
+  test("handles invalid Python syntax", () => {
+    const code1 = "this is not python";
+    const code2 = "also not python";
+    expect(areLogicallyDifferent(code1, code2)).toBe(true);
+  });
+
+  test("decorators are preserved", () => {
+    const code1 = "@decorator\ndef func(): pass";
+    const code2 = "def func(): pass";
+    expect(areLogicallyDifferent(code1, code2)).toBe(true);
+  });
+
+  test("type hints affect comparison", () => {
+    const code1 = "def func(x: int) -> int: return x";
+    const code2 = "def func(x): return x";
+    expect(areLogicallyDifferent(code1, code2)).toBe(true);
+  });
+
+  test("different string quote styles return true", () => {
+    const code1 = 'x = "hello"';
+    const code2 = "x = 'hello'";
+    expect(areLogicallyDifferent(code1, code2)).toBe(true);
+  });
+});
+
+describe("areLogicallyDifferent - advanced Python features", () => {
+  test("lambda functions", () => {
+    expect(
+      areLogicallyDifferent("x = lambda a: a + 1", "x = lambda b: b + 1"),
+    ).toBe(true);
+
+    expect(
+      areLogicallyDifferent("x = lambda a: a + 1", "x = lambda a: a - 1"),
+    ).toBe(true);
+  });
+
+  test("list comprehensions", () => {
+    expect(
+      areLogicallyDifferent(
+        "[x for x in range(10) if x % 2 == 0]",
+        "[x for x in range(10) if x % 2 == 1]",
+      ),
+    ).toBe(true);
+  });
+
+  test("try/except blocks", () => {
+    const code1 = `
+      try:
+        raise ValueError()
+      except ValueError:
+        pass
+    `;
+    const code2 = `
+      try:
+        raise ValueError()
+      except Exception:
+        pass
+    `;
+    expect(areLogicallyDifferent(code1, code2)).toBe(true);
+  });
+
+  test("multiline strings with different line endings", () => {
+    expect(
+      areLogicallyDifferent(
+        'x = """hello\nworld\n"""',
+        'x = """hello\r\nworld\r\n"""',
+      ),
+    ).toBe(true);
+  });
+
+  test("nested function definitions", () => {
+    const code1 = `
+      def outer():
+        def inner(): pass
+        return inner
+    `;
+    const code2 = `
+      def outer():
+        def different(): pass
+        return different
+    `;
+    expect(areLogicallyDifferent(code1, code2)).toBe(true);
+  });
+
+  test("number literals in different bases", () => {
+    expect(areLogicallyDifferent("x = 0xFF", "x = 255")).toBe(true);
+
+    expect(areLogicallyDifferent("x = 0o10", "x = 8")).toBe(true);
+  });
+
+  test("f-strings vs regular strings", () => {
+    expect(areLogicallyDifferent('x = f"hello {42}"', 'x = "hello 42"')).toBe(
+      true,
+    );
+  });
+
+  test("async functions", () => {
+    const code1 = "async def foo(): pass";
+    const code2 = "def foo(): pass";
+    expect(areLogicallyDifferent(code1, code2)).toBe(true);
+  });
+
+  test("multiple assignments", () => {
+    expect(areLogicallyDifferent("a = b = c = 1", "a, b, c = 1, 1, 1")).toBe(
+      true,
+    );
+  });
+
+  test("walrus operator", () => {
+    expect(
+      areLogicallyDifferent("if (x := 42) > 0: pass", "x = 42\nif x > 0: pass"),
+    ).toBe(true);
+  });
+
+  test("function with types", () => {
+    expect(
+      areLogicallyDifferent(
+        "def foo(x: int) -> int: return x",
+        "def foo(x: float) -> int: return x",
+      ),
+    ).toBe(true);
+  });
+});
+
+test("handles empty strings", () => {
+  expect(areLogicallyDifferent("", "")).toBe(false);
+  expect(areLogicallyDifferent("pass", "")).toBe(true);
+});
+
+test("handles invalid python code", () => {
+  const code1 = "x = 1 +";
+  const code2 = "x = 1 +";
+  expect(areLogicallyDifferent(code1, code2)).toBe(false);
+
+  const code3 = "x = 1 +";
+  const code4 = "x = 1 -";
+  expect(areLogicallyDifferent(code3, code4)).toBe(true);
+});
diff --git a/frontend/src/core/codemirror/edited.ts b/frontend/src/core/codemirror/edited.ts
new file mode 100644
index 00000000000..848f7f19e9a
--- /dev/null
+++ b/frontend/src/core/codemirror/edited.ts
@@ -0,0 +1,107 @@
+/* Copyright 2024 Marimo. All rights reserved. */
+import type { Tree } from "@lezer/common";
+import { parser } from "@lezer/python";
+import { LRUCache } from "lru-cache";
+import { Logger } from "@/utils/Logger";
+
+const IGNORE_NODES = new Set(["Comment", "LineComment", "BlockComment", " "]);
+
+// Helper function to get a simplified string representation of the AST
+function getASTString(tree: Tree, code: string): string {
+  const result: string[] = [];
+  const cursor = tree.cursor();
+
+  do {
+    if (IGNORE_NODES.has(cursor.name)) {
+      continue;
+    }
+
+    if (cursor.type.isError) {
+      result.push(code.slice(cursor.from, cursor.to) || "ERROR");
+      continue;
+    }
+
+    switch (cursor.name) {
+      case "VariableName":
+      case "AssignOp":
+      case "Number":
+      case "String":
+      case "Identifier":
+      case "ArithOp":
+      case "CompareOp":
+      case "BinaryOp":
+      case "UnaryOp":
+        result.push(`${cursor.name}(${code.slice(cursor.from, cursor.to)})`);
+        break;
+      default:
+        if (cursor.name) {
+          result.push(cursor.name);
+        }
+    }
+  } while (cursor.next());
+
+  return result.join(":");
+}
+
+// LRU cache for parsed trees
+const treeCache = new LRUCache<string, Tree>({
+  max: 100,
+});
+
+function parseAndCache(code: string): Tree {
+  const cached = treeCache.get(code);
+  if (cached) {
+    return cached;
+  }
+
+  const tree = parser.parse(code);
+  treeCache.set(code, tree);
+  return tree;
+}
+
+/**
+ * Compares two Python code snippets for logical differences,
+ * ignoring whitespace, comments, and formatting differences.
+ *
+ * This function should return as fast as possible.
+ *
+ * Optimizations that won't work:
+ * - Quick comparison after stripping whitespace, because whitespace can
+ * exist in strings.
+ * - Quick node count comparison, since they may contain comments or other
+ * ignored nodes.
+ */
+export function areLogicallyDifferent(
+  code1: string | undefined | null,
+  code2: string | undefined | null,
+): boolean {
+  if (code1 == null || code2 == null) {
+    return code1 !== code2;
+  }
+
+  code1 = code1.trim();
+  code2 = code2.trim();
+
+  // Quick equality check before parsing
+  if (code1 === code2) {
+    return false;
+  }
+
+  // If either code is empty (after trimming), we consider them different
+  if (code1 === "" || code2 === "") {
+    return true;
+  }
+
+  try {
+    const tree1 = parseAndCache(code1);
+    const tree2 = parseAndCache(code2);
+
+    const ast1String = getASTString(tree1, code1);
+    const ast2String = getASTString(tree2, code2);
+
+    return ast1String !== ast2String;
+  } catch {
+    Logger.error("Failed to parse code", { code1, code2 });
+    return code1 !== code2;
+  }
+}