"""DALL·E encoder/decoder usage example.

DALL·E (a portmanteau of the artist Salvador Dalí and Pixar's WALL·E) is a
12-billion-parameter version of GPT-3 trained to generate images from text
descriptions, using a dataset of text–image pairs. It has a diverse set of
capabilities, including creating anthropomorphized versions of animals and
objects, combining unrelated concepts in plausible ways, rendering text, and
applying transformations to existing images.

References:
    https://openai.com/blog/dall-e/#fn1
    https://arxiv.org/abs/2005.14165  (GPT-3)
    https://github.com/openai/DALL-E
"""

import io
import os
import sys

import requests
import PIL
import torch
import torchvision.transforms as T
import torchvision.transforms.functional as TF

from dall_e import map_pixels, unmap_pixels, load_model
from IPython.display import display, display_markdown

# Images are resized/cropped to this square side length before encoding.
target_image_size = 256


def download_image(url):
    """Fetch *url* and return it as a PIL image.

    Raises requests.HTTPError on a non-success HTTP status.
    """
    resp = requests.get(url)
    resp.raise_for_status()
    return PIL.Image.open(io.BytesIO(resp.content))


def preprocess(img):
    """Resize, center-crop, and map *img* into the encoder's pixel range.

    Raises ValueError when the smaller image dimension is below
    target_image_size (upscaling is not attempted).
    Returns a (1, C, target_image_size, target_image_size) float tensor.
    """
    s = min(img.size)
    if s < target_image_size:
        raise ValueError(f'min dim for image {s} < {target_image_size}')

    # PIL's img.size is (width, height); torchvision's resize takes (h, w),
    # hence the reversed indexing below.
    r = target_image_size / s
    s = (round(r * img.size[1]), round(r * img.size[0]))
    img = TF.resize(img, s, interpolation=PIL.Image.LANCZOS)
    img = TF.center_crop(img, output_size=2 * [target_image_size])
    img = torch.unsqueeze(T.ToTensor()(img), 0)
    # map_pixels shifts pixel values into the range the dVAE encoder expects.
    return map_pixels(img)


# This can be changed to a GPU, e.g. 'cuda:0'.
dev = torch.device('cpu')

# For faster load times, download these files locally and use the local
# paths instead.
# NOTE(review): the original text had footnote markers ("[4]", "[5]", "[6]")
# fused into these URL string literals and a stray space inside the image URL
# ("iqjWHBFdfx IU") — both extraction artifacts that would break the
# downloads. The URLs below match the reference list at the end of the file.
enc = load_model("https://cdn.openai.com/dall-e/encoder.pkl", dev)
dec = load_model("https://cdn.openai.com/dall-e/decoder.pkl", dev)

x = preprocess(download_image(
    'https://assets.bwbx.io/images/users/iqjWHBFdfxIU/iKIWgaiJUtss/v2/1000x-1.jpg'))

display_markdown('Original image:')
display(T.ToPILImage(mode='RGB')(x[0]))

import torch.nn.functional as F

# Encode: enc(x) yields per-position logits over the token vocabulary; take
# the argmax token at each spatial position, then one-hot it back to
# (N, vocab_size, H, W) floats, which is the layout the decoder consumes.
z_logits = enc(x)
z = torch.argmax(z_logits, axis=1)
z = F.one_hot(z, num_classes=enc.vocab_size).permute(0, 3, 1, 2).float()

# Decode: the first three output channels are the per-pixel statistics;
# sigmoid + unmap_pixels converts them back into ordinary RGB values.
x_stats = dec(z).float()
x_rec = unmap_pixels(torch.sigmoid(x_stats[:, :3]))
x_rec = T.ToPILImage(mode='RGB')(x_rec[0])

display_markdown('Reconstructed image:')
display(x_rec)

# References
# 1. https://openai.com/blog/dall-e/#fn1
# 2. https://arxiv.org/abs/2005.14165
# 3. https://github.com/openai/DALL-E
# 4. https://cdn.openai.com/dall-e/encoder.pkl
# 5. https://cdn.openai.com/dall-e/decoder.pkl
# 6. https://assets.bwbx.io/images/users/iqjWHBFdfxIU/iKIWgaiJUtss/v2/1000x-1.jpg