# GPT-4o multimodal walkthrough: basic chat, image inputs, and video/audio summarization and Q&A.
from openai import OpenAI
import base64
import os

import cv2
from moviepy.editor import VideoFileClip

MODEL = "gpt-4o"
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# 1 - Basic Chat
completion = client.chat.completions.create(
    model=MODEL,
    messages=[
        {"role": "system", "content": "You are a helpful assistant. Help me with my math homework!"},
        {"role": "user", "content": "Hello! Could you solve 2+2?"},
    ],
)
print("Assistant: " + completion.choices[0].message.content)

# 2 - Image Processing: Base64
IMAGE_PATH = "triangle.png"

def encode_image(image_path):
    # Read a local image and return it as a base64-encoded string.
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")

base64_image = encode_image(IMAGE_PATH)

response = client.chat.completions.create(
    model=MODEL,
    messages=[
        {"role": "system", "content": "You are a helpful assistant that responds in Markdown. Help me with my math homework!"},
        {"role": "user", "content": [
            {"type": "text", "text": "What's the area of the triangle?"},
            {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{base64_image}"}},
        ]},
    ],
    temperature=0.0,
)
print(response.choices[0].message.content)

# 3 - Image Processing: URL
response = client.chat.completions.create(
    model=MODEL,
    messages=[
        {"role": "system", "content": "You are a helpful assistant that responds in Markdown. Help me with my math homework!"},
        {"role": "user", "content": [
            {"type": "text", "text": "What's the area of the triangle?"},
            {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/e/e2/The_Algebra_of_Mohammed_Ben_Musa_-_page_82b.png"}},
        ]},
    ],
    temperature=0.0,
)
print(response.choices[0].message.content)

# 4 - Summarization: Video Summary
VIDEO_PATH = "keynote_recap.mp4"

def process_video(video_path, seconds_per_frame=2):
    # Sample one frame every `seconds_per_frame` seconds and extract the audio track.
    base64Frames = []
    base_video_path, _ = os.path.splitext(video_path)

    video = cv2.VideoCapture(video_path)
    total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = video.get(cv2.CAP_PROP_FPS)
    frames_to_skip = int(fps * seconds_per_frame)
    curr_frame = 0

    while curr_frame < total_frames - 1:
        video.set(cv2.CAP_PROP_POS_FRAMES, curr_frame)
        success, frame = video.read()
        if not success:
            break
        _, buffer = cv2.imencode(".jpg", frame)
        base64Frames.append(base64.b64encode(buffer).decode("utf-8"))
        curr_frame += frames_to_skip
    video.release()

    # Write the audio track to an MP3 next to the video file.
    audio_path = f"{base_video_path}.mp3"
    clip = VideoFileClip(video_path)
    clip.audio.write_audiofile(audio_path, bitrate="32k")
    clip.audio.close()
    clip.close()

    print(f"Extracted {len(base64Frames)} frames")
    print(f"Extracted audio to {audio_path}")
    return base64Frames, audio_path

base64Frames, audio_path = process_video(VIDEO_PATH, seconds_per_frame=1)

# Summarize from the sampled frames alone; "detail": "low" keeps token usage down.
response = client.chat.completions.create(
    model=MODEL,
    messages=[
        {"role": "system", "content": "You are generating a video summary. Please provide a summary of the video. Respond in Markdown."},
        {"role": "user", "content": [
            {"type": "text", "text": "These are the frames from the video."},
            *[{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{frame}", "detail": "low"}} for frame in base64Frames],
        ]},
    ],
    temperature=0,
)
print(response.choices[0].message.content)

# 5 - Summarization: Audio Summary
with open(audio_path, "rb") as audio_file:
    transcription = client.audio.transcriptions.create(
        model="whisper-1",
        file=audio_file,
    )

response = client.chat.completions.create(
    model=MODEL,
    messages=[
        {"role": "system", "content": "You are generating a transcript summary. Create a summary of the provided transcription. Respond in Markdown."},
        {"role": "user", "content": f"The audio transcription is: {transcription.text}"},
    ],
    temperature=0,
)
print(response.choices[0].message.content)

# 6 - Summarization: Audio + Visual Summary
response = client.chat.completions.create(
    model=MODEL,
    messages=[
        {"role": "system", "content": "You are generating a video summary. Create a summary of the provided video and its transcript. Respond in Markdown."},
        {"role": "user", "content": [
            {"type": "text", "text": "These are the frames from the video."},
            *[{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{frame}", "detail": "low"}} for frame in base64Frames],
            {"type": "text", "text": f"The audio transcription is: {transcription.text}"},
        ]},
    ],
    temperature=0,
)
print(response.choices[0].message.content)

# 7 - Q&A: Visual Q&A
QUESTION = "Question: Why did Sam Altman have an example about raising windows and turning the radio on?"

qa_visual_response = client.chat.completions.create(
    model=MODEL,
    messages=[
        {"role": "system", "content": "Use the video to answer the provided question. Respond in Markdown."},
        {"role": "user", "content": [
            {"type": "text", "text": "These are the frames from the video."},
            *[{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{frame}", "detail": "low"}} for frame in base64Frames],
            {"type": "text", "text": QUESTION},
        ]},
    ],
    temperature=0,
)
print("Visual QA:\n" + qa_visual_response.choices[0].message.content)

# 8 - Q&A: Audio Q&A
qa_audio_response = client.chat.completions.create(
    model=MODEL,
    messages=[
        {"role": "system", "content": "Use the transcription to answer the provided question. Respond in Markdown."},
        {"role": "user", "content": f"The audio transcription is: {transcription.text}. \n\n {QUESTION}"},
    ],
    temperature=0,
)
print("Audio QA:\n" + qa_audio_response.choices[0].message.content)

# 9 - Q&A: Visual + Audio Q&A
qa_both_response = client.chat.completions.create(
    model=MODEL,
    messages=[
        {"role": "system", "content": "Use the video and transcription to answer the provided question."},
        {"role": "user", "content": [
            {"type": "text", "text": "These are the frames from the video."},
            *[{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{frame}", "detail": "low"}} for frame in base64Frames],
            {"type": "text", "text": f"The audio transcription is: {transcription.text}"},
            {"type": "text", "text": QUESTION},
        ]},
    ],
    temperature=0,
)
print("Both QA:\n" + qa_both_response.choices[0].message.content)