Tesneem committed on
Commit
25d9750
·
verified ·
1 Parent(s): 08637b4

Update document_chunker.py

Browse files
Files changed (1) hide show
  1. document_chunker.py +18 -22
document_chunker.py CHANGED
@@ -7,6 +7,7 @@ from dataclasses import dataclass
7
  from docx import Document
8
  from sentence_transformers import SentenceTransformer
9
  from sklearn.feature_extraction.text import TfidfVectorizer
 
10
 
11
 
12
  @dataclass
@@ -56,37 +57,20 @@ class DocumentChunker:
56
  }
57
  }
58
 
59
- def match_category(self, text: str, return_first: bool = True) -> Optional[str] or List[str]:
60
- lower_text = text.lower()
61
- match_scores = defaultdict(int)
62
- for category, patterns in self.category_patterns.items():
63
- for pattern in patterns:
64
- matches = re.findall(pattern, lower_text)
65
- match_scores[category] += len(matches)
66
-
67
- if not match_scores:
68
- return None if return_first else []
69
-
70
- sorted_categories = sorted(match_scores.items(), key=lambda x: -x[1])
71
- return sorted_categories[0][0] if return_first else [cat for cat, _ in sorted_categories if match_scores[cat] > 0]
72
-
73
- # def extract_text_from_docx(self, file_path: str) -> str:
74
- # doc = Document(file_path)
75
- # return '\n'.join([f"**{p.text}**" if any(r.bold for r in p.runs) else p.text for p in doc.paragraphs])
76
  def extract_text(self, file_path: str) -> str:
77
  if file_path.endswith(".docx"):
78
  doc = Document(file_path)
79
  return '\n'.join([f"**{p.text}**" if any(r.bold for r in p.runs) else p.text for p in doc.paragraphs])
80
  elif file_path.endswith(".pdf"):
81
- import fitz # PyMuPDF
82
  text = ""
83
  with fitz.open(file_path) as doc:
84
  for page in doc:
85
  text += page.get_text()
86
  return text
87
- else:
88
  return Path(file_path).read_text()
89
-
 
90
 
91
  def detect_document_type(self, text: str) -> str:
92
  keywords = ['grant', 'funding', 'mission']
@@ -109,7 +93,6 @@ class DocumentChunker:
109
  chunks = []
110
 
111
  if not headers:
112
- # fallback chunking
113
  words = text.split()
114
  for i in range(0, len(words), max_words):
115
  piece = ' '.join(words[i:i + max_words])
@@ -140,6 +123,20 @@ class DocumentChunker:
140
  })
141
  return chunks
142
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
  def extract_topics_tfidf(self, text: str, max_features: int = 3) -> List[str]:
144
  clean = re.sub(r'[^\w\s]', ' ', text.lower())
145
  vectorizer = TfidfVectorizer(max_features=max_features * 2, stop_words='english')
@@ -158,7 +155,6 @@ class DocumentChunker:
158
 
159
  def process_document(self, file_path: str, title: Optional[str] = None) -> List[Dict]:
160
  file_path = Path(file_path)
161
- # text = self.extract_text_from_docx(str(file_path)) if file_path.suffix == ".docx" else file_path.read_text()
162
  text = self.extract_text(str(file_path))
163
  doc_type = self.detect_document_type(text)
164
  headers = self.extract_headers(text, doc_type)
 
7
  from docx import Document
8
  from sentence_transformers import SentenceTransformer
9
  from sklearn.feature_extraction.text import TfidfVectorizer
10
+ import fitz # PyMuPDF
11
 
12
 
13
  @dataclass
 
57
  }
58
  }
59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  def extract_text(self, file_path: str) -> str:
61
  if file_path.endswith(".docx"):
62
  doc = Document(file_path)
63
  return '\n'.join([f"**{p.text}**" if any(r.bold for r in p.runs) else p.text for p in doc.paragraphs])
64
  elif file_path.endswith(".pdf"):
 
65
  text = ""
66
  with fitz.open(file_path) as doc:
67
  for page in doc:
68
  text += page.get_text()
69
  return text
70
+ elif file_path.endswith(".txt"):
71
  return Path(file_path).read_text()
72
+ else:
73
+ raise ValueError("Unsupported file format")
74
 
75
  def detect_document_type(self, text: str) -> str:
76
  keywords = ['grant', 'funding', 'mission']
 
93
  chunks = []
94
 
95
  if not headers:
 
96
  words = text.split()
97
  for i in range(0, len(words), max_words):
98
  piece = ' '.join(words[i:i + max_words])
 
123
  })
124
  return chunks
125
 
126
def match_category(self, text: str, return_first: bool = True) -> "Optional[str] | List[str]":
    """Score *text* against ``self.category_patterns`` and return matches.

    Args:
        text: Text to classify; matched case-insensitively against each
            category's regex patterns.
        return_first: If True, return only the best-scoring category name
            (or ``None`` when nothing matches). If False, return every
            matching category, sorted by descending match count (possibly
            an empty list).

    Returns:
        A category name, ``None``, or a list of category names.
    """
    # NOTE: the previous annotation `Optional[str] or List[str]` evaluated
    # to just Optional[str] at runtime — `or` is not a type union.
    lower_text = text.lower()
    match_scores = defaultdict(int)
    for category, patterns in self.category_patterns.items():
        for pattern in patterns:
            hits = len(re.findall(pattern, lower_text))
            # Only record real matches. A bare `+= 0` would still create
            # the key, defeating the emptiness check below and letting
            # return_first=True hand back an arbitrary zero-score category.
            if hits:
                match_scores[category] += hits

    if not match_scores:
        return None if return_first else []

    # Every recorded score is > 0 here, so no post-filter is needed.
    ranked = sorted(match_scores.items(), key=lambda item: -item[1])
    if return_first:
        return ranked[0][0]
    return [category for category, _ in ranked]
139
+
140
  def extract_topics_tfidf(self, text: str, max_features: int = 3) -> List[str]:
141
  clean = re.sub(r'[^\w\s]', ' ', text.lower())
142
  vectorizer = TfidfVectorizer(max_features=max_features * 2, stop_words='english')
 
155
 
156
  def process_document(self, file_path: str, title: Optional[str] = None) -> List[Dict]:
157
  file_path = Path(file_path)
 
158
  text = self.extract_text(str(file_path))
159
  doc_type = self.detect_document_type(text)
160
  headers = self.extract_headers(text, doc_type)