We hosted UMDCTF 2025 about a month ago. This time around, our theme was brainrot. I was inspired to write a challenge involving Italian brainrot, and landed on one built around a latent diffusion model and some interesting RNG reversing.
We are given the following 96x96 image of tralalero tralala and the script below, which runs on the server.

import random
import sys
import numpy as np
import torch
from diffusers import AutoencoderKL, DDIMScheduler, UNet2DConditionModel
from PIL import Image
from tqdm.auto import tqdm
from transformers import CLIPTextModel, CLIPTokenizer


def load():
    model_path_clip = "openai/clip-vit-large-patch14"
    clip_tokenizer = CLIPTokenizer.from_pretrained(model_path_clip)
    clip = CLIPTextModel.from_pretrained(model_path_clip, torch_dtype=torch.float32)
    with open("hf_auth", "r") as f:
        auth_token = f.readlines()[0].strip()
    model_path_diffusion = "CompVis/stable-diffusion-v1-4"
    unet = UNet2DConditionModel.from_pretrained(
        model_path_diffusion,
        subfolder="unet",
        use_auth_token=auth_token,
        variant="fp16",
        torch_dtype=torch.float32,
    )
    vae = AutoencoderKL.from_pretrained(
        model_path_diffusion,
        subfolder="vae",
        use_auth_token=auth_token,
        variant="fp16",
        torch_dtype=torch.float32,
    )
    return (clip_tokenizer, clip, unet, vae)


def run(models, state_bytes):
    (clip_tokenizer, clip, unet, vae) = models

    def tensor_to_image(tensor):
        image = (tensor / 2 + 0.5).clamp(0, 1)
        image = image.permute(0, 2, 3, 1).numpy()
        image = (image[0] * 255).round().astype("uint8")
        return Image.fromarray(image)

    def image_to_np(image):
        return np.array(image).astype(np.float32) / 255.0 * 2.0 - 1.0

    def interp_prev_alpha(prev_t, scheduler):
        if prev_t < 0:
            return scheduler.final_alpha_cumprod
        low = prev_t.floor().long()
        high = prev_t.ceil().long()
        rem = prev_t - low
        low_alpha = scheduler.alphas_cumprod[low]
        high_alpha = scheduler.alphas_cumprod[high]
        return low_alpha * rem + high_alpha * (1 - rem)

    @torch.no_grad()
    def stablediffusion(
        prompt="",
        guidance_scale=7.0,
        steps=50,
        state=None,
        width=96,
        height=96,
    ):
        seed = random.randrange(2**32 - 1)
        generator = torch.manual_seed(seed)
        if state is not None:
            generator.set_state(state)
        latent = torch.randn(
            (1, unet.config.in_channels, height // 8, width // 8),
            generator=generator,
            dtype=torch.float32,
        )
        scheduler = DDIMScheduler(
            beta_start=0.00085,
            beta_end=0.012,
            beta_schedule="scaled_linear",
            num_train_timesteps=1000,
            clip_sample=False,
            set_alpha_to_one=False,
        )
        scheduler.set_timesteps(steps)
        tokens_unconditional = clip_tokenizer(
            "",
            padding="max_length",
            max_length=clip_tokenizer.model_max_length,
            truncation=True,
            return_tensors="pt",
            return_overflowing_tokens=True,
        )
        embedding_unconditional = clip(tokens_unconditional.input_ids).last_hidden_state
        tokens_conditional = clip_tokenizer(
            prompt,
            padding="max_length",
            max_length=clip_tokenizer.model_max_length,
            truncation=True,
            return_tensors="pt",
            return_overflowing_tokens=True,
        )
        embedding_conditional = clip(tokens_conditional.input_ids).last_hidden_state
        for t in tqdm(scheduler.timesteps):
            noise_pred_uncond = unet(
                latent, t, encoder_hidden_states=embedding_unconditional
            ).sample
            noise_pred_cond = unet(
                latent, t, encoder_hidden_states=embedding_conditional
            ).sample
            grad = noise_pred_cond - noise_pred_uncond
            noise_pred = noise_pred_uncond + guidance_scale * grad
            prev_t = (
                t - scheduler.config.num_train_timesteps / scheduler.num_inference_steps
            )
            alpha_prod_t = scheduler.alphas_cumprod[t]
            beta_prod_t = 1 - alpha_prod_t
            alpha_prod_t_prev = interp_prev_alpha(prev_t, scheduler)
            alpha_quotient = (alpha_prod_t / alpha_prod_t_prev) ** 0.5
            first_term = (1.0 / alpha_quotient) * latent
            second_term = (1.0 / alpha_quotient) * (beta_prod_t**0.5) * noise_pred
            third_term = ((1 - alpha_prod_t_prev) ** 0.5) * noise_pred
            latent = first_term - second_term + third_term
        image = vae.decode(latent / 0.18215).sample
        return tensor_to_image(image)

    try:
        state = torch.ByteTensor(state_bytes)
        generated_image = stablediffusion(state=state)
    except Exception as e:
        return str(e)
    generated_np = image_to_np(generated_image)
    tralalero_tralala = Image.open("tralalero_tralala.jpg")
    tralalero_tralala_np = image_to_np(tralalero_tralala)
    mse = np.square(tralalero_tralala_np - generated_np).mean()
    if mse > 0.05:
        return "sorry bud im not seeing tralalero tralala"
    with open("flag.txt", "r") as f:
        return f.read()
The details of the code aren't too important; it's just inference code for Stable Diffusion 1.4. The idea of the challenge is to find a torch.Generator state such that Stable Diffusion inference with a null prompt generates tralalero tralala.
If you look at the inference code (or if you know anything about latent diffusion models), you'll see that the RNG is used to generate the starting latent, which is then iteratively denoised to generate the output image. Typically, this denoising would be conditioned on the text prompt; however, we use no prompt in this challenge, so the output will be something completely random.
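To see concretely what the server lets us control: a torch.Generator state snapshot fully determines every randn draw that follows it, which is exactly how the starting latent is produced. A minimal illustration (the 1x4x12x12 shape matches SD 1.4's four latent channels at 96/8 = 12 pixels per side):

import torch

generator = torch.manual_seed(1337)
state = generator.get_state()  # ByteTensor snapshot of the Mersenne Twister state

latent_a = torch.randn((1, 4, 12, 12), generator=generator)

generator.set_state(state)  # rewind the generator to the snapshot
latent_b = torch.randn((1, 4, 12, 12), generator=generator)

assert torch.equal(latent_a, latent_b)  # same state, same latent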
So, to solve this challenge, we need to find an RNG state that generates a starting latent which, when unconditionally denoised, produces tralalero tralala. The first step is to find a valid starting latent. For those familiar with diffusion models, the approach may be obvious: the key is DDIM inversion.
Below is the core update step for a DDIM model; this is exactly the first_term - second_term + third_term arithmetic in the server's sampling loop:
\[ \bm{x}_{t-1}=\sqrt{\alpha_{t-1}}\left(\frac{\bm{x}_t-\sqrt{1-\alpha_t}\epsilon_\theta^{(t)}(\bm{x}_t)}{\sqrt{\alpha_t}}\right)+\sqrt{1-\alpha_{t-1}}\epsilon_\theta^{(t)}(\bm{x}_t) \]
The idea is that at any given timestep \(t\), the noisy image \(\bm{x}_t\) is a mixture of the original image \(\bm{x}_0\) and standard Gaussian noise \(\epsilon\): specifically, \(\bm{x}_t=\sqrt{\alpha_t}\bm{x}_0+\sqrt{1-\alpha_t}\epsilon\), so the noise term has variance \(1-\alpha_t\). \(\alpha_t\) is determined by a noise scheduler; it starts near 1 at \(t=0\) and decreases towards 0 as \(t\) grows.
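To make the parameterization concrete, here's a sketch of noising a latent to timestep 500 with the same scheduler settings the server uses (the shape and the timestep are arbitrary choices for illustration):

import torch
from diffusers import DDIMScheduler

scheduler = DDIMScheduler(
    beta_start=0.00085,
    beta_end=0.012,
    beta_schedule="scaled_linear",
    num_train_timesteps=1000,
    clip_sample=False,
    set_alpha_to_one=False,
)

x0 = torch.randn(1, 4, 12, 12)  # stand-in for a clean latent
eps = torch.randn_like(x0)      # standard Gaussian noise
t = 500
alpha_t = scheduler.alphas_cumprod[t]
x_t = alpha_t.sqrt() * x0 + (1 - alpha_t).sqrt() * eps

For this schedule, alphas_cumprod runs from roughly 0.9991 at \(t=0\) down to roughly 0.0047 at \(t=999\) (values approximate), so low timesteps are nearly clean and high timesteps are nearly pure noise.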
When we sample from the DDIM, we start at timestep 1000 (where \(\alpha_t\approx0\), so the image is pure noise) and gradually move towards timestep 0, recovering a denoised image.
Interestingly, this process is easily reversible. If we run the alpha schedule in the opposite direction, we start at \(\alpha_t=1\) (the clean image) and gradually decrease towards \(\alpha_t\approx0\) (pure noise). Then we can just solve the update step for \(\bm{x}_t\) as a function of \(\bm{x}_{t-1}\), yielding the formula below:
\[ \bm{x}_t=\sqrt{\alpha_t}\left(\frac{\bm{x}_{t-1}-\sqrt{1-\alpha_{t-1}}\epsilon_\theta^{(t-1)}(\bm{x}_{t-1})}{\sqrt{\alpha_{t-1}}}\right)+\sqrt{1-\alpha_t}\epsilon_\theta^{(t-1)}(\bm{x}_{t-1}) \]
Applying this update formula along the reversed alpha schedule yields a starting latent that regenerates any given output image.
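Here's a minimal sketch of that inversion loop (the ddim_invert name and signature are mine, not part of the challenge; embedding is the null-prompt CLIP embedding, i.e. embedding_unconditional above — with an empty prompt the conditional and unconditional predictions coincide, so the guidance term cancels and one unet call per step suffices):

@torch.no_grad()
def ddim_invert(latent, unet, scheduler, embedding, steps=50):
    scheduler.set_timesteps(steps)
    # Start from the clean latent; alpha at "t < 0" is ~1.
    alpha_prev = scheduler.final_alpha_cumprod
    for t in reversed(scheduler.timesteps.tolist()):  # ascending timesteps
        # Usual DDIM-inversion approximation: evaluate the noise model
        # on the current (less noisy) latent.
        eps = unet(latent, t, encoder_hidden_states=embedding).sample
        alpha_t = scheduler.alphas_cumprod[t]
        # x_t = sqrt(a_t) * (x_{t-1} - sqrt(1-a_{t-1}) * eps) / sqrt(a_{t-1})
        #       + sqrt(1-a_t) * eps
        latent = (
            alpha_t.sqrt() * (latent - (1 - alpha_prev).sqrt() * eps)
            / alpha_prev.sqrt()
            + (1 - alpha_t).sqrt() * eps
        )
        alpha_prev = alpha_t
    return latent

Feeding it the VAE-encoded target image (scaled by the usual 0.18215 factor, mirroring the vae.decode(latent / 0.18215) call in the server script) gives a starting latent that the server's sampling loop maps back to tralalero tralala. So all that's left is finding an RNG state that generates the desired latent.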