sagarnildass committed
Commit 13e0903 · verified · 1 Parent(s): 6f509ec

Upload folder using huggingface_hub

Files changed (3):
  1. __pycache__/models.cpython-310.pyc +0 -0
  2. models.py +15 -3
  3. requirements.txt +16 -12
__pycache__/models.cpython-310.pyc CHANGED
Binary files a/__pycache__/models.cpython-310.pyc and b/__pycache__/models.cpython-310.pyc differ
 
models.py CHANGED
@@ -8,7 +8,7 @@ import tldextract
 from urllib.parse import urlparse, urljoin, urlunparse
 from datetime import datetime
 from typing import Dict, List, Any, Optional, Set, Tuple
-from pydantic import BaseModel, Field, HttpUrl, validator
+from pydantic import BaseModel, Field, HttpUrl, field_validator
 from enum import Enum
 import logging
 
@@ -50,14 +50,14 @@ class URL(BaseModel):
     error: Optional[str] = None  # Error message if failed
     metadata: Dict[str, Any] = Field(default_factory=dict)  # Additional metadata
 
-    @validator("normalized_url", pre=True, always=True)
+    @field_validator("normalized_url", mode="before")
     def set_normalized_url(cls, v, values):
         """Normalize the URL if not already set"""
         if not v and "url" in values:
             return normalize_url(values["url"])
         return v
 
-    @validator("domain", pre=True, always=True)
+    @field_validator("domain", mode="before")
     def set_domain(cls, v, values):
         """Extract domain from URL if not already set"""
         if not v and "url" in values:
@@ -65,6 +65,9 @@ class URL(BaseModel):
             return f"{parsed.domain}.{parsed.suffix}" if parsed.suffix else parsed.domain
         return v
 
+    class Config:
+        arbitrary_types_allowed = True
+
 
 class RobotsInfo(BaseModel):
     """Information from robots.txt for a domain"""
@@ -75,6 +78,9 @@ class RobotsInfo(BaseModel):
     user_agents: Dict[str, Dict[str, Any]] = Field(default_factory=dict)  # Info per user agent
     status_code: Optional[int] = None  # HTTP status code when fetching robots.txt
 
+    class Config:
+        arbitrary_types_allowed = True
+
 
 class Page(BaseModel):
     """Web page model with content and metadata"""
@@ -92,6 +98,9 @@ class Page(BaseModel):
     is_duplicate: bool = False  # Whether this is duplicate content
     metadata: Dict[str, Any] = Field(default_factory=dict)  # Additional metadata
 
+    class Config:
+        arbitrary_types_allowed = True
+
 
 class DomainStats(BaseModel):
     """Statistics for a domain"""
@@ -104,6 +113,9 @@ class DomainStats(BaseModel):
     crawl_times: List[float] = Field(default_factory=list)  # Recent crawl times
     errors: Dict[int, int] = Field(default_factory=dict)  # Status code counts for errors
 
+    class Config:
+        arbitrary_types_allowed = True
+
 
 def normalize_url(url: str) -> str:
     """
requirements.txt CHANGED
@@ -1,24 +1,24 @@
 # Core dependencies
-requests==2.31.0
-beautifulsoup4==4.12.3
-aiohttp==3.9.3
+requests>=2.31.0
+beautifulsoup4>=4.12.0
+aiohttp>=3.9.0
 lxml==4.9.2
 html5lib==1.1
-pydantic==1.10.7
-pymongo==4.6.1
-redis==5.0.1
+pydantic>=2.0,<3.0
+pymongo>=4.6.0
+redis>=5.0.0
 boto3==1.26.123
 docopt==0.6.2
 
 # URL and DNS handling
 dnspython==2.3.0
-tldextract==5.1.1
+tldextract>=5.1.1
 validators==0.20.0
 robotexclusionrulesparser==1.7.1
 urllib3==1.26.15
 
 # Monitoring and metrics
-prometheus-client==0.16.0
+prometheus-client>=0.19.0
 
 # HTML processing
 html2text==2020.1.16
@@ -28,16 +28,20 @@ anyio==3.6.2
 asyncio==3.4.3
 
 # Utilities
-python-dateutil==2.8.2
+python-dateutil>=2.8.2
 pytz==2023.3
 retry==0.9.2
 cryptography==40.0.1
 cachetools==5.3.0
 
 # Added from the code block
-openai==1.12.0
-gradio==4.16.0
+openai>=1.12.0
+gradio>=4.16.0
 chardet==5.2.0
 
 # Dotenv
-python-dotenv
+python-dotenv>=1.0.0
+
+# New dependencies
+mmh3>=4.0.0
+httpx>=0.26.0
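The commit message does not explain the two new packages: mmh3 provides fast non-cryptographic MurmurHash3 hashing, commonly used for content fingerprinting and duplicate detection (which would line up with the Page.is_duplicate field in models.py), and httpx is an async-capable HTTP client. Purely as an illustrative sketch, not this repository's code, something like the following assumed helper (fetch_and_fingerprint is an invented name) would exercise both:

# Illustrative sketch only: fetch_and_fingerprint is an assumed helper,
# not code from this commit; it merely demonstrates the new dependencies.
import asyncio

import httpx
import mmh3


async def fetch_and_fingerprint(url: str) -> tuple[int, str]:
    """Fetch a page and return a 128-bit MurmurHash3 fingerprint plus the body."""
    async with httpx.AsyncClient(follow_redirects=True, timeout=10.0) as client:
        resp = await client.get(url)
        resp.raise_for_status()
    # Fast, non-cryptographic hash: suitable as a duplicate-detection key,
    # unsuitable for integrity or security purposes.
    return mmh3.hash128(resp.text), resp.text


if __name__ == "__main__":
    digest, _ = asyncio.run(fetch_and_fingerprint("https://example.com"))
    print(f"{digest:032x}")

A non-cryptographic hash is a reasonable choice here because dedup keys only need speed and a low collision rate, not resistance to adversarial collisions.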