AI & Tech

Vision Foundation Models: Implementation and Business Applications

Ethan Michael

May 3, 2025
7 Min Read

Vision Foundation Models: Implementation and Business Applications

from transformers import Blip2Processor, Blip2ForConditionalGeneration
import torch
from PIL import Image
import requests
import matplotlib.pyplot as plt
import numpy as np
from io import BytesIO


# Load the BLIP-2 processor and model once, at import time.
# Half precision is requested only when a CUDA device is available; on a
# CPU-only machine the weights stay in float32, because float16 inference
# on CPU is poorly supported by PyTorch and the model is never moved off
# the CPU in that case.
_BLIP2_CHECKPOINT = "Salesforce/blip2-opt-2.7b"
processor = Blip2Processor.from_pretrained(_BLIP2_CHECKPOINT)
model = Blip2ForConditionalGeneration.from_pretrained(
    _BLIP2_CHECKPOINT,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
)


if torch.cuda.is_available():
    model = model.to("cuda")


# Function to download image from URL
def download_image(url, timeout=30):
    """Download an image over HTTP(S) and return it as an RGB PIL image.

    Args:
        url: Address of the image to fetch.
        timeout: Seconds to wait for the server, passed through to
            ``requests.get``. Without it a stalled server would block
            the call indefinitely.

    Returns:
        The decoded image, converted to RGB mode.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection failure or timeout.
    """
    # The whole body is read via ``.content`` anyway, so streaming mode
    # is not needed.
    response = requests.get(url, timeout=timeout)
    # Fail here on HTTP errors instead of handing an error page to PIL,
    # which would surface as an unrelated image-decoding error.
    response.raise_for_status()
    return Image.open(BytesIO(response.content)).convert('RGB')


# Function for image captioning
def generate_caption(image_path):
    """Return a BLIP-2 generated caption for an image.

    ``image_path`` may be an http(s) URL (downloaded), any other string
    (opened as a local file and converted to RGB), or a non-string object,
    which is passed to the processor unchanged.

    The caption is generated with at most 50 new tokens and returned with
    surrounding whitespace stripped.
    """
    # Resolve the argument to an image object.
    if not isinstance(image_path, str):
        picture = image_path
    elif image_path.startswith(('http://', 'https://')):
        picture = download_image(image_path)
    else:
        picture = Image.open(image_path).convert('RGB')

    encoded = processor(images=picture, return_tensors="pt")

    # Move every input tensor next to the model when it lives on the GPU.
    if torch.cuda.is_available():
        encoded = {name: tensor.to("cuda") for name, tensor in encoded.items()}

    token_ids = model.generate(**encoded, max_new_tokens=50)
    decoded = processor.batch_decode(token_ids, skip_special_tokens=True)
    return decoded[0].strip()


# Function for visual question answering
def visual_qa(image_path, question):
    """Answer a free-form question about an image with BLIP-2.

    ``image_path`` may be an http(s) URL (downloaded), any other string
    (opened as a local file and converted to RGB), or a non-string object,
    which is passed to the processor unchanged.

    The question is wrapped in a "Question: ... Answer:" prompt, decoding
    is greedy with at most 30 new tokens, and any echo of the prompt is
    removed from the decoded text before it is returned.
    """
    # Resolve the argument to an image object.
    if not isinstance(image_path, str):
        picture = image_path
    elif image_path.startswith(('http://', 'https://')):
        picture = download_image(image_path)
    else:
        picture = Image.open(image_path).convert('RGB')

    # Prompt template used to steer the model towards answering.
    prompt = f"Question: {question} Answer:"
    encoded = processor(images=picture, text=prompt, return_tensors="pt")

    # Move every input tensor next to the model when it lives on the GPU.
    if torch.cuda.is_available():
        encoded = {name: tensor.to("cuda") for name, tensor in encoded.items()}

    # Greedy decoding (no sampling) for more precise answers.
    generation_args = {"max_new_tokens": 30, "do_sample": False}
    token_ids = model.generate(**encoded, **generation_args)

    decoded = processor.batch_decode(token_ids, skip_special_tokens=True)[0]
    # Drop the prompt text in case the decoded output repeats it.
    return decoded.strip().replace(prompt, "").strip()


# Function to visualize image with caption and QA
def visualize_product_analysis(image_path, questions=None):
    """Caption a product image, ask questions about it, and plot the result.

    Args:
        image_path: An http(s) URL, a local file path, or an already
            loaded image object (used as-is).
        questions: Optional list of question strings. When ``None``, four
            default product questions are asked.

    Returns:
        A ``(caption, answers)`` tuple where ``answers`` is a list of
        ``(question, answer)`` pairs in the order the questions were asked.

    Side effects:
        Opens a matplotlib figure with the image on top and the caption
        and Q&A text below, and shows it with ``plt.show()``.
    """
    # Load image
    if isinstance(image_path, str):
        if image_path.startswith(('http://', 'https://')):
            image = download_image(image_path)
        else:
            image = Image.open(image_path).convert('RGB')
    else:
        image = image_path

    # Generate caption
    caption = generate_caption(image)

    # Default questions if none provided
    if questions is None:
        questions = [
            "What color is this product?",
            "What material is this product made of?",
            "What is the target demographic for this product?",
            "What is a key feature of this product?"
        ]

    # Get answers
    answers = [(question, visual_qa(image, question)) for question in questions]

    # Create visualization
    plt.figure(figsize=(12, 10))

    # Display image
    plt.subplot(2, 1, 1)
    plt.imshow(np.array(image))
    plt.title("Product Image", fontsize=14)
    plt.axis('off')

    # Display caption and Q&A
    plt.subplot(2, 1, 2)
    plt.axis('off')

    # Real newline escapes: the previous "nn"/"n" suffixes were "\n\n"/"\n"
    # with the backslash lost, which rendered everything on a single line.
    text_content = f"Generated Description: {caption}\n\n"
    text_content += "Product Analysis:\n"
    text_content += "".join(f"Q: {q}\nA: {a}\n\n" for q, a in answers)

    plt.text(0.01, 0.99, text_content, transform=plt.gca().transAxes,
             fontsize=12, verticalalignment="top", wrap=True)

    plt.tight_layout()
    plt.show()

    return caption, answers


# Business application: Automated product listing
def create_product_listing(image_path):
    """Build a structured product listing from an image and plot it.

    Args:
        image_path: An http(s) URL, a local file path, or an already
            loaded image object (used as-is).

    Returns:
        A dict with a ``"title"`` (the generated caption) and an
        ``"attributes"`` dict holding the model's answers for ``"color"``,
        ``"material"``, ``"primary_use"`` and ``"unique_features"``.

    Side effects:
        Opens a matplotlib figure with the image on the left and the
        listing text on the right, and shows it with ``plt.show()``.
    """
    # Load image
    if isinstance(image_path, str):
        if image_path.startswith(('http://', 'https://')):
            image = download_image(image_path)
        else:
            image = Image.open(image_path).convert('RGB')
    else:
        image = image_path

    # Get basic caption
    caption = generate_caption(image)

    # Extract product attributes with more specific prompting
    color = visual_qa(image, "What colors are visible in this product?")
    material = visual_qa(image, "What material does this product appear to be made of?")
    use_case = visual_qa(image, "What would be the main use case for this product?")
    unique_features = visual_qa(image, "What are any unique or notable features of this product?")

    # Create structured listing
    listing = {
        "title": caption,
        "attributes": {
            "color": color,
            "material": material,
            "primary_use": use_case,
            "unique_features": unique_features
        }
    }

    # Visualize the listing
    plt.figure(figsize=(14, 10))

    # Display image
    plt.subplot(1, 2, 1)
    plt.imshow(np.array(image))
    plt.title("Product Image", fontsize=14)
    plt.axis('off')

    # Display listing details
    plt.subplot(1, 2, 2)
    plt.axis('off')

    # Real newline escapes: the previous "nn"/"n" suffixes were "\n\n"/"\n"
    # with the backslash lost, which rendered everything on a single line.
    listing_text = "PRODUCT LISTING\n\n"
    listing_text += f"Title: {listing['title']}\n\n"
    listing_text += "Product Attributes:\n"
    listing_text += "".join(
        f"{attr.replace('_', ' ').title()}: {value}\n"
        for attr, value in listing['attributes'].items()
    )

    plt.text(0.01, 0.99, listing_text, transform=plt.gca().transAxes,
             fontsize=12, verticalalignment="top")

    plt.tight_layout()
    plt.show()

    return listing


# Function for marketing content analysis
def analyze_marketing_content(image_path):
    """Ask a fixed set of marketing questions about an image and plot them.

    Args:
        image_path: An http(s) URL, a local file path, or an already
            loaded image object (used as-is).

    Returns:
        A dict mapping a key derived from each question (text before the
        "?", lower-cased, spaces replaced by underscores) to the model's
        answer.

    Side effects:
        Opens a matplotlib figure with the image on the left and the
        question/answer text on the right, and shows it with ``plt.show()``.
    """
    # Load image
    if isinstance(image_path, str):
        if image_path.startswith(('http://', 'https://')):
            image = download_image(image_path)
        else:
            image = Image.open(image_path).convert('RGB')
    else:
        image = image_path

    # Marketing-specific questions
    marketing_questions = [
        "What emotions does this image evoke?",
        "What brand values are communicated in this image?",
        "What target audience would this image appeal to?",
        "What call to action would pair well with this image?",
        "What marketing channel would this image be most effective on?"
    ]

    # Get answers, keeping each one paired with its question so the
    # display below never has to re-associate questions with dict keys.
    answered = [(question, visual_qa(image, question)) for question in marketing_questions]
    marketing_insights = {
        question.split("?")[0].strip().lower().replace(" ", "_"): answer
        for question, answer in answered
    }

    # Visualize the analysis
    plt.figure(figsize=(14, 10))

    # Display image
    plt.subplot(1, 2, 1)
    plt.imshow(np.array(image))
    plt.title("Marketing Visual", fontsize=14)
    plt.axis('off')

    # Display marketing insights
    plt.subplot(1, 2, 2)
    plt.axis('off')

    # Real newline escapes: the previous "nn"/"n" suffixes were "\n\n"/"\n"
    # with the backslash lost, which rendered everything on a single line.
    insights_text = "MARKETING CONTENT ANALYSIS\n\n"
    insights_text += "".join(
        f"{question}\n{answer}\n\n" for question, answer in answered
    )

    plt.text(0.01, 0.99, insights_text, transform=plt.gca().transAxes,
             fontsize=12, verticalalignment="top")

    plt.tight_layout()
    plt.show()

    return marketing_insights


# Function for social media understanding
def analyze_social_media_content(image_path):
    """Caption an image, ask social-media questions about it, and plot them.

    Args:
        image_path: An http(s) URL, a local file path, or an already
            loaded image object (used as-is).

    Returns:
        A dict with the keys ``"caption"``, ``"engagement_potential"``,
        ``"suggested_hashtags"``, ``"platform_fit"`` and ``"content_type"``,
        each holding the corresponding model output.

    Side effects:
        Opens a matplotlib figure with the image on the left and the
        analysis text on the right, and shows it with ``plt.show()``.
    """
    # Load image
    if isinstance(image_path, str):
        if image_path.startswith(('http://', 'https://')):
            image = download_image(image_path)
        else:
            image = Image.open(image_path).convert('RGB')
    else:
        image = image_path

    # Generate caption
    caption = generate_caption(image)

    # Social media specific analysis
    engagement_potential = visual_qa(image, "How likely is this image to engage viewers on social media?")
    suggested_hashtags = visual_qa(image, "What hashtags would be appropriate for this image on social media?")
    platform_fit = visual_qa(image, "Which social media platform would this image perform best on?")
    content_type = visual_qa(image, "What type of social media post would this image be suitable for?")

    # Create analysis dict
    social_analysis = {
        "caption": caption,
        "engagement_potential": engagement_potential,
        "suggested_hashtags": suggested_hashtags,
        "platform_fit": platform_fit,
        "content_type": content_type
    }

    # Visualize the analysis
    plt.figure(figsize=(14, 10))

    # Display image
    plt.subplot(1, 2, 1)
    plt.imshow(np.array(image))
    plt.title("Social Media Content", fontsize=14)
    plt.axis('off')

    # Display social media insights
    plt.subplot(1, 2, 2)
    plt.axis('off')

    # Real newline escapes: the previous "nn"/"n" suffixes were "\n\n"/"\n"
    # with the backslash lost, which rendered everything on a single line.
    insights_text = (
        "SOCIAL MEDIA CONTENT ANALYSIS\n\n"
        f"Caption: {social_analysis['caption']}\n\n"
        f"Engagement Potential: {social_analysis['engagement_potential']}\n\n"
        f"Suggested Hashtags: {social_analysis['suggested_hashtags']}\n\n"
        f"Best Platform: {social_analysis['platform_fit']}\n\n"
        f"Content Type: {social_analysis['content_type']}\n"
    )

    plt.text(0.01, 0.99, insights_text, transform=plt.gca().transAxes,
             fontsize=12, verticalalignment="top")

    plt.tight_layout()
    plt.show()

    return social_analysis


# Example usage: runs the four demo analyses against sample Unsplash images.
if __name__ == "__main__":
    # Example: E-commerce product analysis
    product_url = "https://images.unsplash.com/photo-1598033129183-c4f50c736f10?w=800"

    print("1. Basic Product Analysis")
    caption, qa_results = visualize_product_analysis(product_url)

    # The leading "\n" separates each section header from the previous
    # output; it had lost its backslash and printed a literal "n".
    print("\n2. Creating Automated Product Listing")
    product_listing = create_product_listing(product_url)

    print("\n3. Marketing Content Analysis")
    marketing_url = "https://images.unsplash.com/photo-1581252584837-9f0b1d3bf82c?ixlib=rb-4.0.3&q=80"
    marketing_insights = analyze_marketing_content(marketing_url)

    print("\n4. Social Media Content Analysis")
    social_url = "https://images.unsplash.com/photo-1534442072653-dbbf80c5e1ae?ixlib=rb-4.0.3&q=80"
    social_analysis = analyze_social_media_content(social_url)

From ELIZA to Conversation Modeling: Evolution of Conversational AI Systems and Paradigms

Previous Post

From ELIZA to Conversation Modeling: Evolution of Conversational AI Systems and Paradigms

Oversight at Scale Isn’t Guaranteed: MIT Researchers Quantify the Fragility of Nested AI Supervision with New Elo-Based Framework

Next Post

Oversight at Scale Isn’t Guaranteed: MIT Researchers Quantify the Fragility of Nested AI Supervision with New Elo-Based Framework

Related Posts