Upload folder using huggingface_hub
- __pycache__/models.cpython-310.pyc +0 -0
- models.py +15 -3
- requirements.txt +16 -12
__pycache__/models.cpython-310.pyc
CHANGED
Binary files a/__pycache__/models.cpython-310.pyc and b/__pycache__/models.cpython-310.pyc differ
models.py
CHANGED
@@ -8,7 +8,7 @@ import tldextract
 from urllib.parse import urlparse, urljoin, urlunparse
 from datetime import datetime
 from typing import Dict, List, Any, Optional, Set, Tuple
-from pydantic import BaseModel, Field, HttpUrl, validator
+from pydantic import BaseModel, Field, HttpUrl, field_validator
 from enum import Enum
 import logging
 
@@ -50,14 +50,14 @@ class URL(BaseModel):
     error: Optional[str] = None  # Error message if failed
     metadata: Dict[str, Any] = Field(default_factory=dict)  # Additional metadata
 
-    @validator("normalized_url", pre=True)
+    @field_validator("normalized_url", mode="before")
     def set_normalized_url(cls, v, values):
         """Normalize the URL if not already set"""
         if not v and "url" in values:
             return normalize_url(values["url"])
         return v
 
-    @validator("domain", pre=True)
+    @field_validator("domain", mode="before")
     def set_domain(cls, v, values):
         """Extract domain from URL if not already set"""
         if not v and "url" in values:
@@ -65,6 +65,9 @@ class URL(BaseModel):
             return f"{parsed.domain}.{parsed.suffix}" if parsed.suffix else parsed.domain
         return v
 
+    class Config:
+        arbitrary_types_allowed = True
+
 
 class RobotsInfo(BaseModel):
     """Information from robots.txt for a domain"""
@@ -75,6 +78,9 @@ class RobotsInfo(BaseModel):
     user_agents: Dict[str, Dict[str, Any]] = Field(default_factory=dict)  # Info per user agent
     status_code: Optional[int] = None  # HTTP status code when fetching robots.txt
 
+    class Config:
+        arbitrary_types_allowed = True
+
 
 class Page(BaseModel):
     """Web page model with content and metadata"""
@@ -92,6 +98,9 @@ class Page(BaseModel):
     is_duplicate: bool = False  # Whether this is duplicate content
     metadata: Dict[str, Any] = Field(default_factory=dict)  # Additional metadata
 
+    class Config:
+        arbitrary_types_allowed = True
+
 
 class DomainStats(BaseModel):
     """Statistics for a domain"""
@@ -104,6 +113,9 @@ class DomainStats(BaseModel):
     crawl_times: List[float] = Field(default_factory=list)  # Recent crawl times
     errors: Dict[int, int] = Field(default_factory=dict)  # Status code counts for errors
 
+    class Config:
+        arbitrary_types_allowed = True
+
 
 def normalize_url(url: str) -> str:
     """
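Two notes on this pydantic v1 -> v2 migration. In v2, a `field_validator` receives a `ValidationInfo` object as its second argument rather than the v1 `values` dict, so prior fields must be read from `info.data`; checks like `"url" in values` kept from v1 will not work as intended. Also, class-based `Config` still functions in v2 but is deprecated in favor of `model_config = ConfigDict(...)`. Below is a minimal sketch of the v2-idiomatic form of the `URL` model from this diff; `normalize_url` is stubbed here, since the real implementation lives further down the module.

# Sketch only: v2-idiomatic version of the URL validators in this diff.
from typing import Optional
from pydantic import BaseModel, ConfigDict, ValidationInfo, field_validator
import tldextract

def normalize_url(url: str) -> str:
    return url.strip().rstrip("/").lower()  # stand-in for the module's normalize_url

class URL(BaseModel):
    # v2 replacement for "class Config: arbitrary_types_allowed = True"
    model_config = ConfigDict(arbitrary_types_allowed=True)

    url: str
    normalized_url: Optional[str] = None
    domain: Optional[str] = None

    @field_validator("normalized_url", mode="before")
    @classmethod
    def set_normalized_url(cls, v, info: ValidationInfo):
        """Normalize the URL if not already set."""
        if not v and "url" in info.data:  # info.data holds already-validated fields
            return normalize_url(info.data["url"])
        return v

    @field_validator("domain", mode="before")
    @classmethod
    def set_domain(cls, v, info: ValidationInfo):
        """Extract the registrable domain if not already set."""
        if not v and "url" in info.data:
            parsed = tldextract.extract(info.data["url"])
            return f"{parsed.domain}.{parsed.suffix}" if parsed.suffix else parsed.domain
        return v

u = URL(url="https://Example.com/path/")
assert u.domain == "example.com"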
requirements.txt
CHANGED
@@ -1,24 +1,24 @@
 # Core dependencies
-requests
-beautifulsoup4
-aiohttp
+requests>=2.31.0
+beautifulsoup4>=4.12.0
+aiohttp>=3.9.0
 lxml==4.9.2
 html5lib==1.1
-pydantic
-pymongo
-redis
+pydantic>=2.0,<3.0
+pymongo>=4.6.0
+redis>=5.0.0
 boto3==1.26.123
 docopt==0.6.2
 
 # URL and DNS handling
 dnspython==2.3.0
-tldextract
+tldextract>=5.1.1
 validators==0.20.0
 robotexclusionrulesparser==1.7.1
 urllib3==1.26.15
 
 # Monitoring and metrics
-prometheus-client
+prometheus-client>=0.19.0
 
 # HTML processing
 html2text==2020.1.16
@@ -28,16 +28,20 @@ anyio==3.6.2
 asyncio==3.4.3
 
 # Utilities
-python-dateutil
+python-dateutil>=2.8.2
 pytz==2023.3
 retry==0.9.2
 cryptography==40.0.1
 cachetools==5.3.0
 
 # Added from the code block
-openai
-gradio
+openai>=1.12.0
+gradio>=4.16.0
 chardet==5.2.0
 
 # Dotenv
-python-dotenv
+python-dotenv>=1.0.0
+
+# New dependencies
+mmh3>=4.0.0
+httpx>=0.26.0