diff --git a/.github/workflows/get_function_data_test.yml b/.github/workflows/get_function_data_test.yml index 99696e0..2b72573 100644 --- a/.github/workflows/get_function_data_test.yml +++ b/.github/workflows/get_function_data_test.yml @@ -6,19 +6,41 @@ on: jobs: test: - runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 + with: + path: 'functionRetriever' # Specify a path for the main repo + + - name: Clone testRepo repository + run: | + mkdir -p ${{ github.workspace }}/inputData # Create inputData directory + git clone https://github.com/RapidReview-ai/testRepo ${{ github.workspace }}/inputData/testRepo + + - name: Create outputData directory + run: mkdir -p ${{ github.workspace }}/functionRetriever/outputData + - name: Set up Python 3.x uses: actions/setup-python@v2 with: python-version: '3.11.1' - - name: Install dependencies + + - name: Set up Node.js + uses: actions/setup-node@v2 + with: + node-version: '18.12.1' + + - name: Install npm dependencies + run: npm install + working-directory: ${{ github.workspace }}/functionRetriever + + - name: Install Python dependencies run: | pip install --upgrade pip - # Install any other dependencies your project requires: - # pip install -r requirements.txt + pip install -r requirements.txt + working-directory: ${{ github.workspace }}/functionRetriever + - name: Run tests run: python test_get_function_data.py + working-directory: ${{ github.workspace }}/functionRetriever diff --git a/.gitignore b/.gitignore index 45daf54..a34912e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,9 @@ .env -**.json +function_changes.json +test_function_changes.json +package-lock.json path .DS_Store -**/__pycache__ \ No newline at end of file +**/__pycache__ +node_modules +temp.js \ No newline at end of file diff --git a/babelParser.js b/babelParser.js new file mode 100644 index 0000000..42ccc9b --- /dev/null +++ b/babelParser.js @@ -0,0 +1,14 @@ +const babel = require('@babel/parser'); +const fs = require('fs'); + +const code = fs.readFileSync(process.argv[2], 'utf8'); + +try { + const ast = babel.parse(code, { + sourceType: "module", + plugins: [], + }); + console.log(JSON.stringify(ast)); +} catch (error) { + console.error("Parsing error:", error); +} diff --git a/createEmbeddings.py b/createEmbeddings.py index 31aef36..628764b 100644 --- a/createEmbeddings.py +++ b/createEmbeddings.py @@ -7,7 +7,7 @@ from qdrant_client import QdrantClient from qdrant_client.models import CollectionDescription, Distance, VectorParams, Record -def embed_sample_functions(): +def embed_sample_functions(repo_path): # Initialize Qdrant Client client = QdrantClient(host='localhost', port=6333) # client = QdrantClient(":memory:") @@ -17,7 +17,7 @@ def embed_sample_functions(): openai.api_key = os.getenv("OPENAI_API_KEY") # Load the JSON data from the file - json_file_path = 'outputData/function_changes.json' # depends on how you run the file, should be changed to be global and not local path + json_file_path = 'outputData/test_function_changes.json' if repo_path.endswith('testRepo') else 'outputData/function_changes.json' with open(json_file_path, 'r') as file: json_data = json.load(file) diff --git a/getFunctionData.py b/getFunctionData.py index 1e646e9..faede1c 100644 --- a/getFunctionData.py +++ b/getFunctionData.py @@ -1,92 +1,202 @@ +import os import git import json -import re -import os +import time +import subprocess -def get_function_data(repo_path='../inputData/testRepo2'): - # Determine the output file based on the original repo_path - output_file = 'outputData/test_function_changes.json' if repo_path.endswith('testRepo2') else 'outputData/function_changes.json' - - # Determine the directory where this script is located +def get_function_data(repo_path='../inputData/testRepo'): + output_file = 'outputData/test_function_changes.json' if repo_path.endswith('testRepo') else 'outputData/function_changes.json' script_dir = os.path.dirname(os.path.abspath(__file__)) - - # Construct the path to your repository relative to the script's location repo_path = os.path.join(script_dir, repo_path) - repo = git.Repo(repo_path) - - # Pull the latest changes from the main branch + repo = git.Repo(repo_path) + repo.git.checkout('main') repo.git.pull() merge_commits = [commit for commit in repo.iter_commits('main') if commit.parents and len(commit.parents) > 1] - merge_commits.reverse() # Reverse the list to get the oldest merge commit first + merge_commits.reverse() + + def create_temp_file_and_get_ast(file_content, temp_file_path='temp.js'): + with open(temp_file_path, 'w') as f: + f.write(file_content) + ast = get_ast_from_js(file_content, temp_file_path) + if os.path.exists(temp_file_path): + os.remove(temp_file_path) # Clean up the temporary file + return ast + + def get_ast_from_js(file_content, temp_file_path): + with open(temp_file_path, 'w') as temp_file: + temp_file.write(file_content) + result = subprocess.run(['node', 'babelParser.js', temp_file_path], capture_output=True, text=True) + if result.stderr: + print("Error in parsing:", result.stderr) + return None + return json.loads(result.stdout) + + def get_functions_from_file(file_content): + + # create ast from file content + ast = create_temp_file_and_get_ast(file_content) + + functions = [] + try: + # Traverse the AST to find function declarations + def traverse(node): + if not isinstance(node, dict): + return + + if 'type' in node: + # Check for arrow functions or function expressions assigned to variables + if node['type'] in ['VariableDeclarator'] and 'init' in node: + init_node = node['init'] + if init_node and 'type' in init_node and init_node['type'] in ['FunctionExpression', 'ArrowFunctionExpression']: + function_name = None + if 'name' in node['id']: + function_name = node['id']['name'] + if function_name: + functions.append(function_name) + + # Existing checks for FunctionDeclaration, etc. + elif node['type'] in ['FunctionDeclaration', 'FunctionExpression', 'ArrowFunctionExpression']: + function_name = None + if 'id' in node and node['id'] is not None: + function_name = node['id']['name'] + elif 'key' in node and 'name' in node['key']: + function_name = node['key']['name'] + if function_name: + functions.append(function_name) + + # Check for methods in classes + if node['type'] == 'MethodDefinition' and 'key' in node and node['key']['type'] == 'Identifier': + functions.append(node['key']['name']) + + # Recursively traverse child nodes + for key, value in node.items(): + if isinstance(value, dict): + traverse(value) + elif isinstance(value, list): + for item in value: + if isinstance(item, dict): + traverse(item) + + traverse(ast['program']) + except Exception as e: + print(f"Error processing AST: {e}") + return functions + + def normalize_change_counts(functions): + # Find the min and max changes after merge + min_changes = min(functions.values(), key=lambda x: x['changes_after_merge'])['changes_after_merge'] + max_changes = max(functions.values(), key=lambda x: x['changes_after_merge'])['changes_after_merge'] + + # Normalize the change counts between -1 and 1 + for func_key, func_info in functions.items(): + if max_changes != min_changes: + normalized_score = 2 * ((func_info['changes_after_merge'] - min_changes) / (max_changes - min_changes)) - 1 + else: + normalized_score = 0 + func_info['score'] = normalized_score + + return functions - def get_func_name(diff): - pattern = re.compile(r'function\s+([^\(]+)\s*\(([^)]*)\)\s*{', re.MULTILINE) - return pattern.findall(diff) def get_full_function_at_commit(repo, commit_hash, function_name, file_path): commit = repo.commit(commit_hash) blob = commit.tree / file_path file_content = blob.data_stream.read().decode('utf-8') - pattern = re.compile(r'function\s+' + re.escape(function_name) + r'\s*\((.*?)\)\s*\{([\s\S]*?)\}', re.MULTILINE) - match = pattern.search(file_content) - - if match: - full_function = f"function {function_name}({match.group(1)}) {{{match.group(2)}}}" - return full_function + # create ast from file content + ast = create_temp_file_and_get_ast(file_content) + + try: + # Define a function to recursively search for the function + def find_function(node, function_name): + if not isinstance(node, dict): + return None + + # Handle different types of function nodes + if node.get('type') == 'FunctionDeclaration' and node.get('id', {}).get('name') == function_name: + return node.get('start'), node.get('end') + + if node.get('type') == 'VariableDeclarator': + init_node = node.get('init') + if isinstance(init_node, dict) and init_node.get('type') in ['FunctionExpression', 'ArrowFunctionExpression']: + if node.get('id', {}).get('name') == function_name: + return node.get('start'), node.get('end') + + # Recursive traversal + for key, value in node.items(): + if isinstance(value, dict): + result = find_function(value, function_name) + if result: + return result + elif isinstance(value, list): + for item in value: + result = find_function(item, function_name) + if result: + return result + return None + + # Search for the function in the AST + start_end = find_function(ast['program'], function_name) # Pass function_name here + if start_end: + start, end = start_end + return file_content[start:end] + except Exception as e: + print(f"Error processing AST: {e}") return None functions = {} + + + for commit in merge_commits: - parent_commit = commit.parents[0] - diffs = commit.diff(parent_commit, create_patch=True) - - for diff in diffs: - diff_content = diff.diff.decode('utf-8') - for func_name, _ in get_func_name(diff_content): - full_function = get_full_function_at_commit(repo, commit.hexsha, func_name, diff.a_path) - if full_function: - func_key = f"{diff.a_path}::{func_name}" - if func_key not in functions: - functions[func_key] = { - 'function_name': func_name, - 'merged_function': full_function, - 'commit': commit.hexsha, - 'changes_after_merge': 0, - 'latest_function': full_function, - 'time_first_merged': commit.authored_datetime, - 'file_path': diff.a_path - } - - - for func_key, func_info in functions.items(): - for commit in repo.iter_commits('main', reverse=True): # Iterate from the oldest to newest - if commit.authored_datetime > func_info['time_first_merged']: + for file_path in commit.stats.files: + if file_path.endswith('.js'): + try: + blob = commit.tree / file_path + file_content = blob.data_stream.read().decode('utf-8') + for func_name in get_functions_from_file(file_content): + full_function = get_full_function_at_commit(repo, commit.hexsha, func_name, file_path) + if full_function: + func_key = f"{file_path}::{func_name}" + if func_key not in functions: + functions[func_key] = { + 'function_name': func_name, + 'merged_function': full_function, + 'commit': commit.hexsha, + 'changes_after_merge': 0, + 'latest_function': full_function, + 'time_first_merged': commit.authored_datetime, + 'file_path': file_path + } + except Exception as e: + print(f"Error processing commit {commit.hexsha}: {e}") + continue + + for commit in repo.iter_commits('main', reverse=True): # Iterate from the oldest to newest commit + for file_path in commit.stats.files: + if file_path.endswith('.js'): try: - blob = commit.tree / func_info['file_path'] + blob = commit.tree / file_path file_content = blob.data_stream.read().decode('utf-8') - new_content = get_full_function_at_commit(repo, commit.hexsha, func_info['function_name'], func_info['file_path']) - if new_content and new_content.strip() != func_info['latest_function'].strip(): - func_info['changes_after_merge'] += 1 - func_info['latest_function'] = new_content - except KeyError: + current_functions = get_functions_from_file(file_content) + + for func_key, func_info in functions.items(): + if func_info['file_path'] == file_path: + if func_info['function_name'] in current_functions: + new_content = get_full_function_at_commit(repo, commit.hexsha, func_info['function_name'], file_path) + if new_content and new_content.strip() != func_info['latest_function'].strip() and commit.authored_datetime > func_info['time_first_merged']: + func_info['changes_after_merge'] += 1 + func_info['latest_function'] = new_content + except Exception as e: + print(f"Error processing commit {commit.hexsha}: {e}") continue - # Find the min and max changes after merge - min_changes = min(functions.values(), key=lambda x: x['changes_after_merge'])['changes_after_merge'] - max_changes = max(functions.values(), key=lambda x: x['changes_after_merge'])['changes_after_merge'] - - # Normalize the change counts between -1 and 1 - for func_key, func_info in functions.items(): - if max_changes != min_changes: - normalized_score = 2 * ((func_info['changes_after_merge'] - min_changes) / (max_changes - min_changes)) - 1 - else: - normalized_score = 0 - func_info['score'] = normalized_score + # Normalize the change counts to a score between -1 and 1 + functions = normalize_change_counts(functions) # Convert datetime objects to string before saving for func in functions.values(): @@ -97,5 +207,9 @@ def get_full_function_at_commit(repo, commit_hash, function_name, file_path): json.dump(functions, f, indent=4) if __name__ == '__main__': - # pass repo_path variable if you want to test on another repo other than default - get_function_data() \ No newline at end of file + start_time = time.time() + get_function_data() #pass this variable if you want to run another repo than testRepo: repo_path='../inputData/elixirsolutions' + end_time = time.time() + elapsed_time = round((end_time - start_time) / 60, 2) # convert to minutes and round to 2 decimal places + print('✅ Printed function data to outputData/test_function_changes.json ✅') + print(f'⏰ The program took {elapsed_time} minutes to run. ⏰') \ No newline at end of file diff --git a/main.py b/main.py index 8f8f4e0..6d32ea3 100644 --- a/main.py +++ b/main.py @@ -4,19 +4,23 @@ import createEmbeddings from userInput import processUserInput -# Run the main function from getFunctionData -# pass this variable if you want to run another repo than testRepo2: -repo_path='../inputData/elixirsolutions' -getFunctionData.get_function_data(repo_path) +def main(repo_path='../inputData/testRepo'): + # Run the main function from getFunctionData + getFunctionData.get_function_data(repo_path) -# Run the main function from createEmbeddings -createEmbeddings.embed_sample_functions() + # Run the main function from createEmbeddings + createEmbeddings.embed_sample_functions(repo_path) -# Run the main function from processUserInput -processUserInput.process_user_input() + # Run the main function from processUserInput + processUserInput.process_user_input() -# Create a test suite -suite = unittest.TestLoader().loadTestsFromModule(test_get_function_data) + # Create a test suite + suite = unittest.TestLoader().loadTestsFromModule(test_get_function_data) -# Run the tests with CustomTestRunner -test_get_function_data.CustomTestRunner().run(suite) \ No newline at end of file + # Run the tests with CustomTestRunner + test_get_function_data.CustomTestRunner().run(suite) + +if __name__ == '__main__': + # pass this variable if you want to run another repo than testRepo: + # repo_path='../inputData/elixirsolutions' + main() \ No newline at end of file diff --git a/package.json b/package.json new file mode 100644 index 0000000..c8870bf --- /dev/null +++ b/package.json @@ -0,0 +1,17 @@ +{ + "name": "functionretriever", + "version": "1.0.0", + "description": "- How do I run the database locally? - install docker - install qdrant - run qdrant", + "main": "index.js", + "scripts": { + "test": "echo \"Error: no test specified\" && exit 1" + }, + "keywords": [], + "author": "", + "license": "ISC", + "dependencies": { + "@babel/core": "^7.23.7", + "@babel/parser": "^7.23.6", + "babel": "^6.23.0" + } +} diff --git a/test_get_function_data.py b/test_get_function_data.py index 1968e98..66c65c1 100644 --- a/test_get_function_data.py +++ b/test_get_function_data.py @@ -18,10 +18,10 @@ def setUpClass(cls): def test_createdWithMergeAndNotChangedAfter(self): # Define the expected function key and content - expected_key = 'blocks/tests.js::createdWithMergeAndNotChangedAfter' + expected_key = 'blocks/test.js::createdWithMergeAndNotChangedAfter' expected_function = { 'function_name': 'createdWithMergeAndNotChangedAfter', - 'merged_function': 'function createdWithMergeAndNotChangedAfter() {\n This is the first change\n This is the second change\n}', + 'merged_function': "function createdWithMergeAndNotChangedAfter() {\n console.log('This creates the function on test branch')\n console.log('Second change on test branch')\n}", 'changes_after_merge': 0 } @@ -37,10 +37,10 @@ def test_createdWithMergeAndNotChangedAfter(self): def test_CreatedOnMainAndNotChangedAfterMerge(self): # Define the expected function key and content - expected_key = 'blocks/tests.js::CreatedOnMainAndNotChangedAfterMerge' + expected_key = 'blocks/test.js::CreatedOnMainAndNotChangedAfterMerge' expected_function = { 'function_name': 'CreatedOnMainAndNotChangedAfterMerge', - 'merged_function': 'function CreatedOnMainAndNotChangedAfterMerge() {\n this is the first change\n this is the second change\n this is the third change\n}', + 'merged_function': "function CreatedOnMainAndNotChangedAfterMerge() {\n console.log('This creates the function on main branch')\n console.log('First change on test branch')\n console.log('Second change on test branch')\n}", 'changes_after_merge': 0 } @@ -50,12 +50,13 @@ def test_CreatedOnMainAndNotChangedAfterMerge(self): self.assertEqual(self.function_data[expected_key]['merged_function'].strip(), expected_function['merged_function'].strip()) self.assertEqual(self.function_data[expected_key]['changes_after_merge'], expected_function['changes_after_merge']) + def test_createdWithMergeAndChangedAfterMerge(self): # Define the expected function key and content - expected_key = 'blocks/tests.js::createdWithMergeAndChangedAfterMerge' + expected_key = 'blocks/test.js::createdWithMergeAndChangedAfterMerge' expected_function = { 'function_name': 'createdWithMergeAndChangedAfterMerge', - 'merged_function': 'function createdWithMergeAndChangedAfterMerge() {\n this is the first change\n this is the second change\n}', + 'merged_function': "function createdWithMergeAndChangedAfterMerge() {\n console.log('This creates the function on test branch')\n console.log('second change on test branch')\n}", 'changes_after_merge': 1 } @@ -64,13 +65,13 @@ def test_createdWithMergeAndChangedAfterMerge(self): self.assertEqual(self.function_data[expected_key]['function_name'], expected_function['function_name']) self.assertEqual(self.function_data[expected_key]['merged_function'].strip(), expected_function['merged_function'].strip()) self.assertEqual(self.function_data[expected_key]['changes_after_merge'], expected_function['changes_after_merge']) - - def test_createdOnMainAndChangedAfterMerge(self): + + def test_createdOnMainAndChangedAfterWithMerge(self): # Define the expected function key and content - expected_key = 'blocks/tests.js::createdOnMainAndChangedAfterMerge' + expected_key = 'blocks/test.js::createdOnMainAndChangedAfterWithMerge' expected_function = { - 'function_name': 'createdOnMainAndChangedAfterMerge', - 'merged_function': 'function createdOnMainAndChangedAfterMerge() {\n first change on main\n change on test branch\n second change on test branch\n}', + 'function_name': 'createdOnMainAndChangedAfterWithMerge', + 'merged_function': "function createdOnMainAndChangedAfterWithMerge() {\n console.log('This creates the function on main branch')\n console.log('First change on test branch')\n console.log('Second change on test branch')\n}", 'changes_after_merge': 1 } @@ -79,13 +80,13 @@ def test_createdOnMainAndChangedAfterMerge(self): self.assertEqual(self.function_data[expected_key]['function_name'], expected_function['function_name']) self.assertEqual(self.function_data[expected_key]['merged_function'].strip(), expected_function['merged_function'].strip()) self.assertEqual(self.function_data[expected_key]['changes_after_merge'], expected_function['changes_after_merge']) - + def test_changedAfterMergeWithMerge(self): # Define the expected function key and content - expected_key = 'blocks/tests.js::changedAfterMergeWithMerge' + expected_key = 'blocks/test.js::test_changedAfterMergeWithMerge' expected_function = { - 'function_name': 'changedAfterMergeWithMerge', - 'merged_function': 'function changedAfterMergeWithMerge() {\n this is the first change\n this is the second change\n}', + 'function_name': 'test_changedAfterMergeWithMerge', + 'merged_function': "function test_changedAfterMergeWithMerge() {\n console.log('This creates the function')\n console.log('Second change on test branch')\n}", 'changes_after_merge': 2 } @@ -94,7 +95,7 @@ def test_changedAfterMergeWithMerge(self): self.assertEqual(self.function_data[expected_key]['function_name'], expected_function['function_name']) self.assertEqual(self.function_data[expected_key]['merged_function'].strip(), expected_function['merged_function'].strip()) self.assertEqual(self.function_data[expected_key]['changes_after_merge'], expected_function['changes_after_merge']) - + class CustomTestRunner(unittest.TextTestRunner): def run(self, test): result = super(CustomTestRunner, self).run(test)