Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -288,77 +288,107 @@ def fDistancePlot(text2Party):
|
|
288 |
return safe_plot(plot_func)
|
289 |
|
290 |
def DispersionPlot(textParty):
|
291 |
-
"""
|
292 |
-
|
|
|
|
|
|
|
293 |
try:
|
294 |
-
word_tokens_party = word_tokenize(textParty)
|
295 |
-
print(f"Debug DispersionPlot: Total tokens: {len(word_tokens_party)}")
|
296 |
if not word_tokens_party:
|
297 |
print("Warning: No tokens found for dispersion plot.")
|
298 |
return None
|
299 |
|
300 |
-
moby = Text(word_tokens_party)
|
301 |
fdistance = FreqDist(word_tokens_party)
|
302 |
-
print(f"Debug DispersionPlot: FreqDist sample: {list(fdistance.most_common(10))}")
|
303 |
|
304 |
# --- Improved word selection logic ---
|
305 |
-
|
306 |
-
|
307 |
-
|
308 |
-
|
309 |
-
|
|
|
|
|
310 |
|
311 |
# Select top 5 from filtered list
|
312 |
-
|
313 |
-
|
314 |
-
else:
|
315 |
-
word_Lst = [common_words_filtered[x][0] for x in range(5)]
|
316 |
-
|
317 |
-
# Final check: Ensure words are present in the Text object (moby)
|
318 |
-
final_word_list = [word for word in word_Lst if word in moby] # Check membership in the Text object
|
319 |
-
print(f"Debug DispersionPlot: Final word list for plot: {final_word_list}") # Debug print
|
320 |
|
321 |
if not final_word_list:
|
322 |
-
print("Warning: No suitable words found for dispersion plot
|
323 |
# Create a simple plot indicating no data
|
324 |
fig, ax = plt.subplots(figsize=(8, 3))
|
325 |
ax.text(0.5, 0.5, "No suitable words found for dispersion plot", ha='center', va='center', transform=ax.transAxes)
|
326 |
ax.set_xlim(0, 1)
|
327 |
ax.set_ylim(0, 1)
|
328 |
-
ax.axis('off')
|
329 |
fig.suptitle('Dispersion Plot')
|
330 |
else:
|
331 |
-
# ---
|
332 |
-
fig = plt.
|
333 |
-
|
334 |
-
#
|
335 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
336 |
plt.tight_layout()
|
337 |
|
338 |
buf = BytesIO()
|
339 |
-
# Handle potential apply_aspect error
|
340 |
try:
|
341 |
-
fig.savefig(buf, format='png', bbox_inches='tight')
|
342 |
except AttributeError as ae:
|
343 |
if "apply_aspect" in str(ae):
|
344 |
-
|
345 |
-
|
346 |
-
|
347 |
-
|
348 |
else:
|
349 |
-
|
350 |
buf.seek(0)
|
351 |
img = Image.open(buf)
|
352 |
-
plt.close(fig)
|
353 |
return img
|
354 |
|
355 |
except Exception as e:
|
356 |
print(f"Dispersion plot error: {e}")
|
357 |
if buf:
|
358 |
-
buf.close()
|
359 |
traceback.print_exc()
|
360 |
-
plt.close('all')
|
361 |
-
return None
|
|
|
362 |
|
363 |
def word_cloud_generator(parsed_text_name, text_Party):
|
364 |
"""Generates the word cloud image."""
|
|
|
288 |
return safe_plot(plot_func)
|
289 |
|
290 |
def DispersionPlot(textParty):
    """
    Generates a dispersion plot using Matplotlib.

    Shows the positions of the most common words along the text: one row
    per word, one scatter marker per occurrence, with the x-axis being the
    token index in the lowercased, tokenized text.

    Args:
        textParty: The raw text to analyse (a string).

    Returns:
        A PIL image of the rendered plot. If no candidate word survives the
        filter, the image is a placeholder figure saying so. Returns None
        when the text yields no tokens or when any error occurs (the error
        and its traceback are printed rather than raised).
    """
    # Word-selection limits: look at the 15 most frequent tokens and plot
    # at most the first 5 that pass the filter.
    candidate_pool_size = 15
    max_words = 5

    buf = None
    fig = None
    try:
        word_tokens_party = word_tokenize(textParty.lower())  # Lowercase for matching
        print(f"Debug DispersionPlot: Total tokens: {len(word_tokens_party)}")
        if not word_tokens_party:
            print("Warning: No tokens found for dispersion plot.")
            return None

        fdistance = FreqDist(word_tokens_party)
        print(f"Debug DispersionPlot: FreqDist sample: {list(fdistance.most_common(10))}")

        # --- Word selection ---
        # Keep alphabetic tokens longer than two characters. isalpha()
        # already rejects digits and punctuation, so no separate digit
        # check is needed.
        common_words_filtered = [
            (word, freq)
            for word, freq in fdistance.most_common(candidate_pool_size)
            if len(word) > 2 and word.isalpha()
        ]
        print(f"Debug DispersionPlot: Filtered common words: {common_words_filtered}")

        final_word_list = [word for word, _ in common_words_filtered[:max_words]]
        print(f"Debug DispersionPlot: Final word list for plot: {final_word_list}")

        if not final_word_list:
            print("Warning: No suitable words found for dispersion plot.")
            # Create a simple plot indicating no data
            fig, ax = plt.subplots(figsize=(8, 3))
            ax.text(0.5, 0.5, "No suitable words found for dispersion plot",
                    ha='center', va='center', transform=ax.transAxes)
            ax.set_xlim(0, 1)
            ax.set_ylim(0, 1)
            ax.axis('off')
            fig.suptitle('Dispersion Plot')
        else:
            # --- Create the dispersion plot manually ---
            fig, ax = plt.subplots(figsize=(12, 6))

            # Collect the token indices of every target word in a single
            # pass instead of rescanning the token list once per word.
            positions = {word: [] for word in final_word_list}
            for index, token in enumerate(word_tokens_party):
                if token in positions:
                    positions[token].append(index)

            # pyplot.get_cmap replaces matplotlib.cm.get_cmap, which was
            # removed in Matplotlib 3.9; it accepts the same (name, lut).
            colors = plt.get_cmap('tab10', len(final_word_list))

            for row, word in enumerate(final_word_list):
                offsets = positions[word]
                # Each word gets its own horizontal row at y = row + 1.
                ax.scatter(offsets, [row + 1] * len(offsets), label=word,
                           color=colors(row), alpha=0.7, s=30)  # s is marker size

            ax.set_xlabel("Position in Text (Token Index)")
            ax.set_ylabel("Words")
            ax.set_title("Dispersion Plot")

            # The y tick labels name the rows, so no legend is drawn.
            ax.set_yticks(range(1, len(final_word_list) + 1))
            ax.set_yticklabels(final_word_list)

            # Invert y-axis so the first word in the list is at the top
            ax.invert_yaxis()

            # Add grid for better readability
            ax.grid(True, axis='x', linestyle='--', alpha=0.5)

        # Lay out this figure explicitly rather than whichever figure
        # pyplot currently considers active.
        fig.tight_layout()

        buf = BytesIO()
        try:
            fig.savefig(buf, format='png', bbox_inches='tight', dpi=150)
        except AttributeError as ae:
            # Only the known apply_aspect failure of bbox_inches='tight'
            # is retried; anything else goes to the outer handler.
            if "apply_aspect" not in str(ae):
                raise
            print(f"Warning: bbox_inches='tight' failed for Dispersion Plot ({ae}), saving without it.")
            # Discard whatever the failed save may have written.
            buf.close()
            buf = BytesIO()
            fig.savefig(buf, format='png', dpi=150)

        buf.seek(0)
        # buf is deliberately left open on success: the returned image
        # is backed by it.
        return Image.open(buf)

    except Exception as e:
        print(f"Dispersion plot error: {e}")
        if buf is not None:
            buf.close()
        traceback.print_exc()
        return None
    finally:
        # Close only the figure created here, on success and failure alike,
        # so figures owned by other code are left untouched.
        if fig is not None:
            plt.close(fig)
|
391 |
+
|
392 |
|
393 |
def word_cloud_generator(parsed_text_name, text_Party):
|
394 |
"""Generates the word cloud image."""
|