taarhissian committed on
Commit
7303878
·
verified ·
1 Parent(s): eca9640

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -63
app.py CHANGED
@@ -1,73 +1,41 @@
1
- import easyocr
2
- from PIL import Image
3
- import re
4
  import gradio as gr
5
- import numpy as np # Ensure numpy is imported
6
-
7
# Shared EasyOCR reader, created once at import time for English text.
ocr_languages = ['en']
reader = easyocr.Reader(ocr_languages)
9
-
10
# Regex alternatives (matched against lower-cased OCR text) for each room type.
room_patterns = {
    'bedroom': r'bedroom|bed\s?rm',
    'bathroom': r'bathroom|bath\s?rm',
    'kitchen': r'kitchen',
    'living room': r'living\s?room|sitting\s?room',
    'dining room': r'dining\s?room',
    # Add more patterns as needed
}

# Matches dimensions like 10'6" or 10x12.
# Deliberately written WITHOUT capturing groups: with groups, re.findall
# returns tuples of groups instead of the matched text, and those tuples
# cannot be joined into the display string later on.
dimension_pattern = r"\d+'\s?\d+\"|\d+\.?\d*\s?[x×]\s?\d+\.?\d*"
21
-
22
- # Function to extract room data with counts and measurements
23
def extract_room_data(results, room_patterns, dimension_pattern):
    """Count room mentions and collect dimension strings from OCR results.

    Args:
        results: Iterable of OCR results. Each item is either a plain text
            string or a sequence whose element at index 1 is the text.
        room_patterns: Mapping of room type name to a regex that is searched
            in the lower-cased text.
        dimension_pattern: Regex describing a dimension; every full match
            found in a text is recorded for each room type that text mentions.

    Returns:
        Dict mapping each detected room type to
        ``{"count": int, "measurements": list[str]}``.
    """
    room_data = {}

    for result in results:
        # Accept bare strings as well as tuple-style results; indexing a
        # string with [1] would silently pick its second character.
        raw_text = result if isinstance(result, str) else result[1]
        text = raw_text.lower()

        # Dimensions depend only on the text, so scan lazily and at most once.
        dimensions = None

        for room_type, pattern in room_patterns.items():
            if not re.search(pattern, text):
                continue

            entry = room_data.setdefault(
                room_type, {"count": 0, "measurements": []}
            )
            entry["count"] += 1

            if dimensions is None:
                # group(0) is the whole match regardless of any capturing
                # groups in the pattern, so measurements are always strings.
                dimensions = [
                    match.group(0)
                    for match in re.finditer(dimension_pattern, text)
                ]
            entry["measurements"].extend(dimensions)

    return room_data
41
 
42
- # Function to process the uploaded image
43
def process_image(image):
    """Run OCR on an uploaded floor plan and summarise the rooms found.

    Args:
        image: The uploaded image. A PIL image or a NumPy array is accepted
            (both are converted with ``np.asarray``); ``None`` means nothing
            was uploaded.

    Returns:
        A multi-line text report with one entry per detected room type,
        giving its count and the measurements found next to it.
    """
    if image is None:
        return "No image provided."

    # np.asarray handles PIL images and arrays alike. Image.fromarray would
    # fail here when the interface delivers a PIL image (type="pil").
    pixels = np.asarray(image)

    # Use the detailed result format: extract_room_data reads the text from
    # index 1 of each result, which the text-only format (detail=0) lacks.
    results = reader.readtext(pixels)

    room_data_with_counts = extract_room_data(results, room_patterns, dimension_pattern)

    lines = ["Extracted Room Data with Counts and Measurements:"]
    for room_type, data in room_data_with_counts.items():
        # Measurements may arrive as regex group tuples; reduce each one to
        # its first non-empty group so the join below always gets strings.
        measurements = [
            item if isinstance(item, str) else next((part for part in item if part), "")
            for item in data["measurements"]
        ]
        lines.append(f"- {room_type.capitalize()}:")
        lines.append(f" Count: {data['count']}")
        lines.append(f" Measurements: {', '.join(measurements)}")

    return "\n".join(lines) + "\n"
 
61
 
62
# Wire process_image into a single-image, text-output Gradio app.
image_input = gr.Image(type="pil")  # deliver the upload as a PIL image
iface = gr.Interface(
    title="Floor Plan Room Detection",
    description="Upload a floor plan image to extract room information.",
    fn=process_image,
    inputs=image_input,
    outputs="text",
)

# Start the app; share=True requests a shareable link.
iface.launch(share=True)
73
-
 
1
+ import torch
2
+ from transformers import AutoProcessor, AutoModelForVision2Seq
 
3
  import gradio as gr
4
+ from PIL import Image
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
# Checkpoint identifier used for both the processor and the model.
MODEL_NAME = "microsoft/kosmos-2-patch14-224"

# Pick the execution device first: CUDA when torch reports it, else CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the Kosmos-2 processor and model, then move the model to the device.
processor = AutoProcessor.from_pretrained(MODEL_NAME)
model = AutoModelForVision2Seq.from_pretrained(MODEL_NAME)
model.to(device)
15
 
16
def analyze_image(image, prompt):
    """Process an image with a text prompt using Kosmos-2.

    Args:
        image: The uploaded image as a NumPy array, or ``None`` when no
            image has been provided.
        prompt: Text prompt passed to the processor alongside the image.

    Returns:
        The decoded model output, or a string starting with ``"Error: "``
        describing what went wrong.
    """
    if image is None:
        # Report a missing upload explicitly instead of surfacing the
        # opaque exception Image.fromarray(None) would raise.
        return "Error: no image provided."

    try:
        pil_image = Image.fromarray(image)  # Convert to PIL Image
        inputs = processor(images=pil_image, text=prompt, return_tensors="pt").to(device)

        # Cap the number of *generated* tokens. max_length also counts the
        # prompt, which the processor expands with image placeholder tokens,
        # so a total cap of 50 can be used up before anything is generated.
        output = model.generate(**inputs, max_new_tokens=50)
        return processor.batch_decode(output, skip_special_tokens=True)[0]

    except Exception as e:
        # UI boundary: show the failure in the output box rather than crash.
        return f"Error: {str(e)}"
30
 
31
# Gradio app: an image plus a prompt in, the generated text out.
interface_inputs = [gr.Image(type="numpy"), gr.Textbox(label="Prompt")]
response_box = gr.Textbox(label="Generated Response")

iface = gr.Interface(
    title="Kosmos-2 Image Reasoning",
    description="Upload an image and provide a text prompt. Kosmos-2 will generate insights based on the image and text input.",
    fn=analyze_image,
    inputs=interface_inputs,
    outputs=response_box,
)

# Start serving the app.
iface.launch()