ASureevaA committed · Commit 3680138 · Parent(s): c14e744

fix image q
app.py
CHANGED
@@ -1,5 +1,5 @@
 import tempfile
-from typing import List, Tuple
+from typing import List, Tuple, Any
 
 import gradio as gr
 import soundfile as soundfile_module
@@ -20,8 +20,32 @@ from transformers import (
 
 MODEL_STORE = {}
 
+def _normalize_gallery_images(gallery_value: Any) -> List[Image.Image]:
+    if not gallery_value:
+        return []
+
+    normalized_images: List[Image.Image] = []
+
+    for item in gallery_value:
+        if isinstance(item, Image.Image):
+            normalized_images.append(item)
+            continue
+
+        if isinstance(item, (list, tuple)) and item:
+            candidate = item[0]
+            if isinstance(candidate, Image.Image):
+                normalized_images.append(candidate)
+            continue
+
+        if isinstance(item, dict):
+            candidate = item.get("image") or item.get("value")
+            if isinstance(candidate, Image.Image):
+                normalized_images.append(candidate)
+            continue
 
 
+    return normalized_images
+
 def get_audio_pipeline(model_key: str):
     if model_key in MODEL_STORE:
         return MODEL_STORE[model_key]
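Context for the hunk above, not part of the commit: gr.Gallery values can arrive as bare PIL images, (image, caption) pairs, or dicts, which is what the new helper flattens. A minimal sketch of the shapes it accepts (the helper itself is assumed to be importable from app.py):

from PIL import Image

red = Image.new("RGB", (8, 8), "red")
blue = Image.new("RGB", (8, 8), "blue")

# The three item shapes handled by _normalize_gallery_images: a bare PIL image,
# an (image, caption) tuple, and a dict with an "image" (or "value") key.
gallery_value = [red, (blue, "caption"), {"image": red}]

# _normalize_gallery_images(gallery_value) would return [red, blue, red].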
@@ -310,9 +334,18 @@ def estimate_image_depth(image_object):
 
     predicted_depth_tensor = depth_output["predicted_depth"]
 
+    if predicted_depth_tensor.ndim == 3:
+        predicted_depth_tensor = predicted_depth_tensor.unsqueeze(1)
+    elif predicted_depth_tensor.ndim == 2:
+        predicted_depth_tensor = predicted_depth_tensor.unsqueeze(0).unsqueeze(0)
+    else:
+        raise ValueError(
+            f"Неожиданная размерность predicted_depth: {predicted_depth_tensor.shape}"
+        )
+
     resized_depth_tensor = torch_functional.interpolate(
-        predicted_depth_tensor
-        size=image_object.size[::-1],
+        predicted_depth_tensor,
+        size=image_object.size[::-1],
         mode="bicubic",
         align_corners=False,
     )
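For context on the fix above: torch.nn.functional.interpolate expects a 4-D (N, C, H, W) input, which is why 2-D and 3-D depth outputs are unsqueezed first. A minimal standalone sketch:

import torch
import torch.nn.functional as F

depth_2d = torch.rand(96, 128)                 # (H, W), as some depth models return it
depth_4d = depth_2d.unsqueeze(0).unsqueeze(0)  # -> (1, 1, H, W)

resized = F.interpolate(depth_4d, size=(480, 640), mode="bicubic", align_corners=False)
print(resized.shape)                           # torch.Size([1, 1, 480, 640])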
@@ -335,13 +368,24 @@ def generate_image_caption(image_object, model_key: str) -> str:
 
 
 def answer_visual_question(image_object, question_text: str, model_key: str) -> str:
+    if image_object is None:
+        return "Пожалуйста, сначала загрузите изображение."
+
+    if not question_text.strip():
+        return "Пожалуйста, введите вопрос об изображении."
+
     vqa_pipeline = get_vision_pipeline(model_key)
-    vqa_result = vqa_pipeline(image_object, question_text)
 
-
-
-
+    vqa_result = vqa_pipeline(
+        image=image_object,
+        question=question_text,
+    )
+
+    top_item = vqa_result[0]
+    answer_text = top_item["answer"]
+    confidence_value = top_item["score"]
 
+    return f"{answer_text} (confidence: {confidence_value:.3f})"
 
 def perform_zero_shot_classification(
     image_object,
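The keyword-argument call added above matches how the transformers visual-question-answering pipeline is usually invoked; a hedged sketch, where the checkpoint name is an assumption and running it downloads that model:

from PIL import Image
from transformers import pipeline

# Checkpoint chosen for illustration only.
vqa = pipeline("visual-question-answering", model="dandelin/vilt-b32-finetuned-vqa")

image = Image.new("RGB", (224, 224), "white")
result = vqa(image=image, question="What color is the image?")

top = result[0]  # a list of {"answer": ..., "score": ...} dicts, best first
print(f'{top["answer"]} (confidence: {top["score"]:.3f})')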
@@ -379,11 +423,13 @@ def perform_zero_shot_classification(
 
 
 def retrieve_best_image(
-
+    gallery_value: Any,
     query_text: str,
     clip_key: str,
-):
-
+) -> Tuple[str, Image.Image | None]:
+    image_list = _normalize_gallery_images(gallery_value)
+
+    if not image_list or not query_text.strip():
         return "Пожалуйста, загрузите изображения и введите запрос", None
 
     clip_model, clip_processor = get_clip_components(clip_key)
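The retrieval body itself is unchanged by this commit; for reference only, a typical CLIP text-to-image ranking step of the kind get_clip_components presumably feeds might look roughly like this (checkpoint and variable names are assumptions, not taken from app.py):

import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

image_list = [Image.new("RGB", (224, 224), "red"), Image.new("RGB", (224, 224), "blue")]
inputs = clip_processor(text=["a red square"], images=image_list,
                        return_tensors="pt", padding=True)

with torch.no_grad():
    outputs = clip_model(**inputs)

# logits_per_text has shape (num_texts, num_images); the argmax column is the
# best-matching image for the query.
best_index = int(outputs.logits_per_text.argmax(dim=-1))
best_image = image_list[best_index]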
@@ -426,44 +472,83 @@ def retrieve_best_image(
 
 def segment_image_with_sam_points(
     image_object,
-    point_coordinates_list: List[List[int]]
-) -> Image:
+    point_coordinates_list: List[List[int]],
+) -> Image.Image:
+    if image_object is None:
+        raise ValueError("Изображение не передано в segment_image_with_sam_points")
 
     if not point_coordinates_list:
         return Image.new("L", image_object.size, color=0)
 
     sam_model, sam_processor = get_sam_components()
 
-    batched_points = [point_coordinates_list]
-    batched_labels = [[1 for _ in point_coordinates_list]]
+    batched_points: List[List[List[int]]] = [point_coordinates_list]
+    batched_labels: List[List[int]] = [[1 for _ in point_coordinates_list]]
 
     sam_inputs = sam_processor(
-        image_object,
+        image=image_object,
         input_points=batched_points,
        input_labels=batched_labels,
         return_tensors="pt",
     )
 
     with torch.no_grad():
-        sam_outputs = sam_model(**sam_inputs)
+        sam_outputs = sam_model(**sam_inputs, multimask_output=True)
 
-
-        sam_outputs.pred_masks.cpu(),
+    processed_masks_list = sam_processor.image_processor.post_process_masks(
+        sam_outputs.pred_masks.squeeze(1).cpu(),
         sam_inputs["original_sizes"].cpu(),
         sam_inputs["reshaped_input_sizes"].cpu(),
     )
 
-
-
+    batch_masks_tensor = processed_masks_list[0]
+
+    if batch_masks_tensor.ndim != 3 or batch_masks_tensor.shape[0] == 0:
         return Image.new("L", image_object.size, color=0)
 
-    first_mask_tensor =
-    mask_array = first_mask_tensor.
+    first_mask_tensor = batch_masks_tensor[0]
+    mask_array = first_mask_tensor.numpy()
 
-
+    binary_mask_array = (mask_array > 0.5).astype("uint8") * 255
+
+    mask_image = Image.fromarray(binary_mask_array, mode="L")
     return mask_image
 
 
+def segment_image_with_sam_points_ui(image_object, coordinates_text: str) -> Image.Image:
+
+    if image_object is None:
+        return None
+
+    coordinates_text_clean = coordinates_text.strip()
+    if not coordinates_text_clean:
+        return Image.new("L", image_object.size, color=0)
+
+    point_coordinates_list: List[List[int]] = []
+
+    for raw_pair in coordinates_text_clean.replace("\n", ";").split(";"):
+        raw_pair_clean = raw_pair.strip()
+        if not raw_pair_clean:
+            continue
+
+        parts = raw_pair_clean.split(",")
+        if len(parts) != 2:
+            continue
+
+        try:
+            x_value = int(parts[0].strip())
+            y_value = int(parts[1].strip())
+        except ValueError:
+            continue
+
+        point_coordinates_list.append([x_value, y_value])
+
+    if not point_coordinates_list:
+        return Image.new("L", image_object.size, color=0)
+
+    return segment_image_with_sam_points(image_object, point_coordinates_list)
+
+
 def parse_point_coordinates_text(coordinates_text: str) -> List[List[int]]:
     if not coordinates_text.strip():
         return []
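The last added lines above turn a mask tensor into a greyscale PIL image; a tiny self-contained sketch of that conversion, with a dummy array standing in for the SAM output:

import numpy as np
from PIL import Image

mask_array = np.zeros((240, 320), dtype=np.float32)
mask_array[60:180, 80:240] = 1.0  # pretend this region was segmented

binary_mask_array = (mask_array > 0.5).astype("uint8") * 255
mask_image = Image.fromarray(binary_mask_array, mode="L")
print(mask_image.size)  # (320, 240) -- PIL reports (width, height)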
@@ -485,16 +570,6 @@ def parse_point_coordinates_text(coordinates_text: str) -> List[List[int]]:
 
     return point_list
 
-
-def segment_image_with_sam_points_ui(
-    image_object,
-    coordinates_text: str,
-):
-    point_coordinates_list = parse_point_coordinates_text(coordinates_text)
-    return segment_image_with_sam_points(image_object, point_coordinates_list)
-
-
-
 def build_interface():
     with gr.Blocks(title="Multimodal AI Demo", theme=gr.themes.Soft()) as demo_block:
         gr.Markdown("#Мультимодальные AI модели")
@@ -588,9 +663,8 @@ def build_interface():
                 inputs=[asr_audio_input_component, asr_model_selector],
                 outputs=asr_output_component,
             )
-
         with gr.Tab("Синтез речи"):
-            gr.Markdown("## Text-to-Speech
+            gr.Markdown("## Text-to-Speech")
             with gr.Row():
                 with gr.Column():
                     tts_text_component = gr.Textbox(
@@ -612,11 +686,12 @@ def build_interface():
                 with gr.Column():
                     tts_audio_output_component = gr.Audio(
                         label="Синтезированная речь",
+                        type="filepath",
                     )
 
             tts_button.click(
                 fn=synthesize_speech,
-                inputs=tts_text_component,
+                inputs=[tts_text_component, tts_model_selector],
                 outputs=tts_audio_output_component,
             )
 
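The inputs fix above matters because Button.click passes each listed component to the handler as a separate positional argument; a minimal Gradio sketch with made-up component and function names:

import gradio as gr

def synthesize(text, model_name):
    # Stand-in for synthesize_speech(text, model_key).
    return f"{model_name}: {text}"

with gr.Blocks() as demo:
    text_box = gr.Textbox(label="Text")
    model_dd = gr.Dropdown(choices=["model-a", "model-b"], value="model-a", label="Model")
    out_box = gr.Textbox(label="Output")
    run_btn = gr.Button("Run")

    # Two inputs -> the handler receives two arguments, in list order.
    run_btn.click(fn=synthesize, inputs=[text_box, model_dd], outputs=out_box)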
@@ -706,7 +781,7 @@ def build_interface():
                    )
                    sam_coordinates_text = gr.Textbox(
                        label="Координаты точек",
-                        placeholder="
+                        placeholder="650,380; 600,450; 550,520",
                        lines=2,
                    )
                    sam_button = gr.Button("Сегментировать по точкам")