Spaces:

MicroHealth
/

git-license-extractor

Paused

App Files Community

bluenevus committed on Apr 13

Commit

1a646d6

verified ·

1 Parent(s): 00d9616

Update app.py

Browse files

Files changed (1) hide show

app.py +24 -76

app.py CHANGED Viewed

@@ -3,97 +3,45 @@ import google.generativeai as genai
 import requests
 import base64
 import json
-from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
 def fetch_github_files(github_url, personal_access_token):
-    try:
-        # Parse the GitHub URL
-        parts = github_url.split('/')
-        owner = parts[3]
-        repo = parts[4].split('.git')[0]
-        branch = 'main'  # You might want to make this configurable
-        # List of common dependency files to look for
-        dependency_files = [
-            'requirements.txt',
-            'package.json',
-            'Gemfile',
-            'pom.xml',
-            'build.gradle',
-            'composer.json',
-            'Cargo.toml',
-            'go.mod',
-            'Pipfile'
-        ]
-        all_content = ""
-        # Set up headers with the personal access token
-        headers = {
-            "Authorization": f"token {personal_access_token}",
-            "Accept": "application/vnd.github.v3+json"
-        }
-        for file_path in dependency_files:
-            # Construct the API URL
-            api_url = f"https://api.github.com/repos/{owner}/{repo}/contents/{file_path}?ref={branch}"
-            # Make the API request
-            response = requests.get(api_url, headers=headers)
-            if response.status_code == 200:
-                content = response.json()
-                if isinstance(content, dict) and 'content' in content:
-                    # This is a file
-                    file_content = base64.b64decode(content['content']).decode('utf-8')
-                    all_content += f"\n\n--- {file_path} ---\n{file_content}"
-                else:
-                    # This is a directory or something else, skip it
-                    continue
-        if not all_content:
-            return "Error: No dependency files found in the repository."
-        return all_content
-    except requests.exceptions.RequestException as e:
-        return f"Error accessing GitHub: {str(e)}"
-    except json.JSONDecodeError:
-        return f"Error: Unable to parse GitHub API response for {file_path}"
-@retry(
-    stop=stop_after_attempt(3),
-    wait=wait_exponential(multiplier=1, min=4, max=10),
-    retry=retry_if_exception_type(Exception),
-    reraise=True
-)
-def process_with_gemini(file_content, gemini_api_key):
     genai.configure(api_key=gemini_api_key)
     model = genai.GenerativeModel('gemini-2.5-pro-preview-03-25')
-    # Limit content size (adjust the limit as needed)
-    max_content_length = 10000  # characters
-    if len(file_content) > max_content_length:
-        file_content = file_content[:max_content_length] + "..."
     prompt = f"""
     Analyze the following file content for open-source license information:
-    {file_content}
     Please provide:
-    1. A numbered with the name dependency and version as the title
-    2. 1st bullet under title has a brief summary of what the depency does
-    3. 2nd bullet under title has the license name apache 2.0
     4. 3rd bullet under title has a hyperlink to the license file
     5. Provide no other information such as greeting or summary as the purpose is to catalog and document all open source licenses used.
     """
-    try:
-        response = model.generate_content(prompt, timeout=60)  # Set a timeout of 60 seconds
-        return response.text
-    except Exception as e:
-        print(f"Error in Gemini API call: {str(e)}")
-        raise ValueError(f"Gemini API error: {str(e)}")
 def process_input(github_url, personal_access_token, gemini_api_key):
     if not github_url.startswith("https://github.com/"):

 import requests
 import base64
 import json
+from tenacity import retry, stop_after_attempt, wait_fixed
 def fetch_github_files(github_url, personal_access_token):
+    # ... (keep this function as is) ...
+def process_chunk_with_gemini(chunk, gemini_api_key):
     genai.configure(api_key=gemini_api_key)
     model = genai.GenerativeModel('gemini-2.5-pro-preview-03-25')
     prompt = f"""
     Analyze the following file content for open-source license information:
+    {chunk}
     Please provide:
+    1. A numbered list with the name dependency and version as the title
+    2. 1st bullet under title has a brief summary of what the dependency does
+    3. 2nd bullet under title has the license name
     4. 3rd bullet under title has a hyperlink to the license file
     5. Provide no other information such as greeting or summary as the purpose is to catalog and document all open source licenses used.
     """
+    response = model.generate_content(prompt)
+    return response.text
+@retry(stop=stop_after_attempt(3), wait=wait_fixed(2))
+def process_with_gemini(file_content, gemini_api_key):
+    # Split the content into chunks of approximately 4000 characters
+    chunk_size = 4000
+    chunks = [file_content[i:i+chunk_size] for i in range(0, len(file_content), chunk_size)]
+    results = []
+    for chunk in chunks:
+        result = process_chunk_with_gemini(chunk, gemini_api_key)
+        results.append(result)
+    # Combine the results
+    combined_result = "\n\n".join(results)
+    return combined_result
 def process_input(github_url, personal_access_token, gemini_api_key):
     if not github_url.startswith("https://github.com/"):