11import os
22import yaml
3+ import fnmatch
34from pocketflow import Node , BatchNode
45from utils .crawl_github_files import crawl_github_files
56from utils .call_llm import call_llm # Assuming you have this utility
67
def crawl_local_files(directory, include_patterns=None, exclude_patterns=None, max_file_size=None, use_relative_paths=True):
    """
    Crawl files in a local directory with the same interface as crawl_github_files.

    Args:
        directory (str): Path to the local directory to crawl.
        include_patterns (set): fnmatch patterns to include (e.g. {"*.py", "*.js"}).
            When None or empty, every file is included.
        exclude_patterns (set): fnmatch patterns to exclude (e.g. {"tests/*"}).
        max_file_size (int): Skip files larger than this many bytes (None = no limit).
        use_relative_paths (bool): Key the result by paths relative to `directory`
            instead of the full walked path.

    Returns:
        dict: {"files": {filepath: content}}

    Raises:
        ValueError: If `directory` is not an existing directory.
    """
    if not os.path.isdir(directory):
        raise ValueError(f"Directory does not exist: {directory}")

    files_dict = {}

    for root, _, files in os.walk(directory):
        for filename in files:
            filepath = os.path.join(root, filename)

            # Path used both as the result key and for pattern matching.
            if use_relative_paths:
                relpath = os.path.relpath(filepath, directory)
            else:
                relpath = filepath

            # fnmatch does not treat os.sep specially, so normalize to "/" for
            # matching; patterns like "tests/*" then also work on Windows
            # (where relpath uses backslashes). No-op on POSIX.
            match_path = relpath.replace(os.sep, "/")

            # No include patterns means "include everything".
            if include_patterns and not any(
                fnmatch.fnmatch(match_path, pattern) for pattern in include_patterns
            ):
                continue

            if exclude_patterns and any(
                fnmatch.fnmatch(match_path, pattern) for pattern in exclude_patterns
            ):
                continue

            # Enforce the size limit. A file can vanish (or be a broken symlink)
            # between walk() and stat(); treat that the same as "too big": skip
            # it instead of crashing the whole crawl.
            if max_file_size:
                try:
                    if os.path.getsize(filepath) > max_file_size:
                        continue
                except OSError:
                    continue

            try:
                with open(filepath, "r", encoding="utf-8") as f:
                    files_dict[relpath] = f.read()
            except Exception as e:
                # Best effort: unreadable/undecodable (e.g. binary) files are
                # skipped with a warning rather than aborting the crawl.
                print(f"Warning: Could not read file {filepath}: {e}")

    return {"files": files_dict}
70+
771# Helper to create context from files, respecting limits (basic example)
872def create_llm_context (files_data ):
973 context = ""
@@ -26,20 +90,26 @@ def get_content_for_indices(files_data, indices):
2690
2791class FetchRepo (Node ):
2892 def prep (self , shared ):
29- repo_url = shared ["repo_url" ]
93+ repo_url = shared .get ("repo_url" )
94+ local_dir = shared .get ("local_dir" )
3095 project_name = shared .get ("project_name" )
96+
3197 if not project_name :
32- # Basic name derivation from URL
33- project_name = repo_url .split ('/' )[- 1 ].replace ('.git' , '' )
98+ # Basic name derivation from URL or directory
99+ if repo_url :
100+ project_name = repo_url .split ('/' )[- 1 ].replace ('.git' , '' )
101+ else :
102+ project_name = os .path .basename (os .path .abspath (local_dir ))
34103 shared ["project_name" ] = project_name
35104
36- # Get file patterns directly from shared (defaults are defined in main.py)
105+ # Get file patterns directly from shared
37106 include_patterns = shared ["include_patterns" ]
38107 exclude_patterns = shared ["exclude_patterns" ]
39108 max_file_size = shared ["max_file_size" ]
40109
41110 return {
42111 "repo_url" : repo_url ,
112+ "local_dir" : local_dir ,
43113 "token" : shared .get ("github_token" ),
44114 "include_patterns" : include_patterns ,
45115 "exclude_patterns" : exclude_patterns ,
@@ -48,15 +118,26 @@ def prep(self, shared):
48118 }
49119
50120 def exec (self , prep_res ):
51- print (f"Crawling repository: { prep_res ['repo_url' ]} ..." )
52- result = crawl_github_files (
53- repo_url = prep_res ["repo_url" ],
54- token = prep_res ["token" ],
55- include_patterns = prep_res ["include_patterns" ],
56- exclude_patterns = prep_res ["exclude_patterns" ],
57- max_file_size = prep_res ["max_file_size" ],
58- use_relative_paths = prep_res ["use_relative_paths" ]
59- )
121+ if prep_res ["repo_url" ]:
122+ print (f"Crawling repository: { prep_res ['repo_url' ]} ..." )
123+ result = crawl_github_files (
124+ repo_url = prep_res ["repo_url" ],
125+ token = prep_res ["token" ],
126+ include_patterns = prep_res ["include_patterns" ],
127+ exclude_patterns = prep_res ["exclude_patterns" ],
128+ max_file_size = prep_res ["max_file_size" ],
129+ use_relative_paths = prep_res ["use_relative_paths" ]
130+ )
131+ else :
132+ print (f"Crawling directory: { prep_res ['local_dir' ]} ..." )
133+ result = crawl_local_files (
134+ directory = prep_res ["local_dir" ],
135+ include_patterns = prep_res ["include_patterns" ],
136+ exclude_patterns = prep_res ["exclude_patterns" ],
137+ max_file_size = prep_res ["max_file_size" ],
138+ use_relative_paths = prep_res ["use_relative_paths" ]
139+ )
140+
60141 # Convert dict to list of tuples: [(path, content), ...]
61142 files_list = list (result .get ("files" , {}).items ())
62143 print (f"Fetched { len (files_list )} files." )
0 commit comments