"""DALL·E encoder/decoder usage example.

DALL·E (a portmanteau of the artist Salvador Dalí and Pixar's WALL·E) is a
12-billion-parameter version of GPT-3 trained to generate images from text
descriptions, using a dataset of text–image pairs. It has a diverse set of
capabilities, including creating anthropomorphized versions of animals and
objects, combining unrelated concepts in plausible ways, rendering text, and
applying transformations to existing images.

References:
    https://openai.com/blog/dall-e/#fn1
    https://arxiv.org/abs/2005.14165  (GPT-3)
    https://github.com/openai/DALL-E
"""

import io
import os
import sys

import requests
import PIL
import torch
import torchvision.transforms as T
import torchvision.transforms.functional as TF

from dall_e import map_pixels, unmap_pixels, load_model
from IPython.display import display, display_markdown

# Images are resized/cropped to this square side length before encoding.
target_image_size = 256


def download_image(url):
    """Fetch *url* and return it as a PIL image.

    Raises requests.HTTPError on a non-success HTTP status.
    """
    resp = requests.get(url)
    resp.raise_for_status()
    return PIL.Image.open(io.BytesIO(resp.content))


def preprocess(img):
    """Resize, center-crop, and map *img* into the encoder's pixel range.

    Raises ValueError when the smaller image dimension is below
    target_image_size (upscaling is not attempted).
    Returns a (1, C, target_image_size, target_image_size) float tensor.
    """
    s = min(img.size)
    if s < target_image_size:
        raise ValueError(f'min dim for image {s} < {target_image_size}')

    # PIL's img.size is (width, height); torchvision's resize takes (h, w),
    # hence the reversed indexing below.
    r = target_image_size / s
    s = (round(r * img.size[1]), round(r * img.size[0]))
    img = TF.resize(img, s, interpolation=PIL.Image.LANCZOS)
    img = TF.center_crop(img, output_size=2 * [target_image_size])
    img = torch.unsqueeze(T.ToTensor()(img), 0)
    # map_pixels shifts pixel values into the range the dVAE encoder expects.
    return map_pixels(img)


# This can be changed to a GPU, e.g. 'cuda:0'.
dev = torch.device('cpu')

# For faster load times, download these files locally and use the local
# paths instead.
# NOTE(review): the original text had footnote markers ("[4]", "[5]", "[6]")
# fused into these URL string literals and a stray space inside the image URL
# ("iqjWHBFdfx IU") — both extraction artifacts that would break the
# downloads. The URLs below match the reference list at the end of the file.
enc = load_model("https://cdn.openai.com/dall-e/encoder.pkl", dev)
dec = load_model("https://cdn.openai.com/dall-e/decoder.pkl", dev)

x = preprocess(download_image(
    'https://assets.bwbx.io/images/users/iqjWHBFdfxIU/iKIWgaiJUtss/v2/1000x-1.jpg'))

display_markdown('Original image:')
display(T.ToPILImage(mode='RGB')(x[0]))

import torch.nn.functional as F

# Encode: enc(x) yields per-position logits over the token vocabulary; take
# the argmax token at each spatial position, then one-hot it back to
# (N, vocab_size, H, W) floats, which is the layout the decoder consumes.
z_logits = enc(x)
z = torch.argmax(z_logits, axis=1)
z = F.one_hot(z, num_classes=enc.vocab_size).permute(0, 3, 1, 2).float()

# Decode: the first three output channels are the per-pixel statistics;
# sigmoid + unmap_pixels converts them back into ordinary RGB values.
x_stats = dec(z).float()
x_rec = unmap_pixels(torch.sigmoid(x_stats[:, :3]))
x_rec = T.ToPILImage(mode='RGB')(x_rec[0])

display_markdown('Reconstructed image:')
display(x_rec)

# References
# 1. https://openai.com/blog/dall-e/#fn1
# 2. https://arxiv.org/abs/2005.14165
# 3. https://github.com/openai/DALL-E
# 4. https://cdn.openai.com/dall-e/encoder.pkl
# 5. https://cdn.openai.com/dall-e/decoder.pkl
# 6. https://assets.bwbx.io/images/users/iqjWHBFdfxIU/iKIWgaiJUtss/v2/1000x-1.jpg