pygame-ce Colorkey blit optimization

trafficstars

Most of pygame game types consist of pixel art, which oftentimes utilizes colokey blitting for effective use of pixel assets. This PR optimizes that with AVX2/SSE2 significantly. This strategy is limited to 32bit surfaces that don't use RLE.

The improvements vary widely with surface size so refer to these graphs for a better visualization: Figure_1 Current VS AVX2 Current VS SSE2 SSE2VS AVX2

My actual timings files: blit_colorkey_AVX2.json blit_colorkey_old.json blit_colorkey_SSE2.json

All made with the following test program:

import pygame.draw

from utils.data_utils import Plotter, Evaluator
from pygame import Surface


def tests_setup(curr_size: int, g: dict):
    base = Surface((curr_size, curr_size))
    surf = Surface((curr_size, curr_size))
    surf.fill(0xFF00FF00)
    surf.set_colorkey(0xFF00FF00)

    cs2 = curr_size // 2

    pygame.draw.circle(surf, 0xFFFF0000, (cs2, cs2), cs2)

    g["surf"] = surf
    g["base"] = base


tests = [
    ("blit_colorkey_PR", "base.blit(surf, (0, 0))"),
]

Evaluator(tests, tests_setup, max_size=1000, reps=100, num=1).run()

files = [
    ("blit_colorkey_old", "white"),
    ("blit_colorkey_AVX2", "green"),
    ("blit_colorkey_SSE2", "blue"),
]

p = Plotter("colorkey_blit", files)
p.plot_tests()
# p.compare([(0, 1)])

And this is the data_utils file if you're interested:

import json

from matplotlib import pyplot as plt
import matplotlib.colors as mcolors

from statistics import mean, stdev, median
from timeit import repeat

COLORS = list(mcolors.CSS4_COLORS.keys())

__all__ = ["Plotter", "Evaluator"]

JSON_DIR = "files_json"

class Plotter:

    def __init__(self, title: str, tests: list, mode: str = "MIN",
                 limit_to_range: int = -1):
        plt.style.use(["dark_background"])
        self.title = title
        self.tests = tests

        self.mode = mode
        self.mode_func = None
        self.limit_to_range = limit_to_range
        self.filter(mode)

    def filter(self, mode: str):
        self.mode = mode

        match mode:
            case "MEAN":
                self.mode_func = mean
            case "MIN":
                self.mode_func = min
            case "MAX":
                self.mode_func = max
            case "MEDIAN":
                self.mode_func = median

    def plot_tests(self):
        for file_name, color in self.tests:
            try:
                with open(f"{JSON_DIR}/{file_name}.json", "r") as f:
                    data = json.load(f)
            except FileNotFoundError:
                print(f"File {file_name}.json not found!")
                quit()

            timings = [self.mode_func(dp) for dp in data["data"]][:self.limit_to_range]

            print(f"=== {file_name} ===")
            print(
                f"Total: {sum([sum(data_point) for data_point in data['data']])}\n"
                f"Mean: {mean(timings)}\n"
                f"Median: {median(timings)}\n"
                f"Stdev: {stdev(timings)}"
            )
            print()
            plt.plot(timings, color=color, label=file_name, linewidth=1)
        plt.legend()
        plt.title(self.title)
        plt.xlabel("Surface size (px)")
        plt.ylabel("Time (s)")
        plt.show()

    def compare(self, indices: list[tuple[int, int]], c1="white", c2="lime"):
        for i1, i2 in indices:
            filename_1, _ = self.tests[i1]
            filename_2, _ = self.tests[i2]

            try:
                with open(f"{JSON_DIR}/{filename_1}.json", "r") as f:
                    data_1 = json.load(f)
            except FileNotFoundError:
                print(f"File {filename_1}.json not found!")
                quit()

            try:
                with open(f"{JSON_DIR}/{filename_2}.json", "r") as f:
                    data_2 = json.load(f)
            except FileNotFoundError:
                print(f"File {filename_2}.json not found!")
                quit()

            timings_1 = [self.mode_func(dp) for dp in data_1["data"]][
                        :self.limit_to_range]
            timings_2 = [self.mode_func(dp) for dp in data_2["data"]][
                        :self.limit_to_range]

            plt.figure(figsize=(10, 5))
            curr_index = 1

            plt.subplot(len(indices), 2, curr_index)
            plt.scatter(range(len(timings_1)), timings_1, color=c1, label=filename_1,
                        s=.5)
            plt.scatter(range(len(timings_1)), timings_2, color=c2, label=filename_2,
                        s=.5)
            plt.legend()
            plt.title("Timings")
            plt.xlabel("Surface size (px)")
            plt.ylabel("Time (s)")

            plt.subplot(len(indices), 2, curr_index + 1)
            comparative_data = [
                100 * ((t1 / t2) - 1) for t1, t2 in zip(timings_1, timings_2)
            ]
            plt.scatter(range(len(comparative_data)), comparative_data, color="red", s=1)
            plt.plot([0] * len(comparative_data), color="green", linewidth=2)
            plt.title("Relative % improvement")
            plt.xlabel("Surface size (px)")

        plt.show()


class Evaluator:
    def __init__(self, tests: list, tests_setup, pre_test_setup=None,
                 max_size: int = 1000, reps: int = 30,
                 num: int = 1):
        self.tests = tests
        self.tests_setup = tests_setup
        self.max_size = max_size
        self.reps = reps
        self.num = num
        self.G = {}

        if pre_test_setup:
            try:
                pre_test_setup(self.G)
            except TypeError:
                raise TypeError("pre_test_setup must be a callable")

    def run(self):
        for test_name, statement in self.tests:
            data = {"title": test_name, "data": []}

            print(f"\n========| {test_name.upper()} |========")

            for curr_size in range(1, self.max_size + 1):

                self.print_progress_bar(curr_size)

                self.tests_setup(curr_size, self.G)
                data["data"].append(self.run_test(statement))

            with open(f"{JSON_DIR}/{test_name}.json", "w") as f:
                json.dump(data, f)

    def print_progress_bar(self, curr_size: int):
        amt = 3 * curr_size // 100
        print(
            "\r[" + "▪" * amt + " " * (3 * 10 - amt) + f"] {curr_size} | {self.max_size}", end=""
        )

    def run_test(self, statement):
        return repeat(statement, globals=self.G, number=self.num, repeat=self.reps)

Jan 27 '24 14:01 itzpr3d4t0r

When I read the title, I assumed this was a SIMD implementation of alphablit_colorkey. As far as I can understand, that's not it at all.

What you've made is not for alpha blitting all, it's another special cased blit routine to take over from SDL.

Therefore I don't think should be in pygameBlit, because everything in pygameBlit is for alpha or blend flags? If you can determine what code path it's meant for in surface.blit, why not send it there directly?

Secondly, could this be upstreamed to SDL? That way we don't need a special path at all, and everyone's code gets faster.

Jan 30 '24 07:01 Starbuck5

When I read the title, I assumed this was a SIMD implementation of alphablit_colorkey. As far as I can understand, that's not it at all. What you've made is not for alpha blitting all, it's another special cased blit routine to take over from SDL.

Yes, that's correct. It's an AVX/SSE implementation of the staight colorkey blit.

Therefore I don't think should be in pygameBlit, because everything in pygameBlit is for alpha or blend flags? If you can determine what code path it's meant for in surface.blit, why not send it there directly?

I included it here to prevent redundant code. Creating a separate function would entail duplicating the rect clipping logic and other checks, which seemed impractical.

Secondly, could this be upstreamed to SDL? That way we don't need a special path at all, and everyone's code gets faster.

It could but how long would we have to wait for it? I've seen people suggest avx/sse improvements to old code and in all got flagged for 3.2.0 at best so I'm afraid we would never see these improvements soon. And generally speaking SDL doesn't look to implement much past SSE code from what I've seen. So the wait could be very long for something that would impact tons of pygame games immediately.

And a note, these improvements, even in the sse case, look to be even better than our best AVX alpha/premultiplied alpha implementations.

Feb 04 '24 12:02 itzpr3d4t0r

I believe that with pygame having "game" in its name and many softwares made with it being pixel art games, this change is really important and worth implementing it directly here and not going through SDL. Besides, it won't require us changing it or expanding it again since it's a unique case.

Feb 04 '24 12:02 itzpr3d4t0r

A little test program for reproducibility (without this PR i get around 200/250 fps and with this PR 470/520): Images: flower grass grass_tile tree

import pygame
from random import randint

pygame.init()

screen = pygame.display.set_mode((1000, 1000))

N_TREES = 150
N_GRASS = 4000
N_FLOWERS = 1000

tree = pygame.image.load("tree.png").convert()
tile = pygame.image.load("grass_tile.png").convert()
grass_img = pygame.image.load("grass.png").convert()
flower_img = pygame.image.load("flower.png").convert()

for surf in [tree, tile, grass_img, flower_img]:
    surf.set_colorkey((255, 0, 220))

tiles = [
    (tile, (j * tile.get_width(), i * tile.get_height())) for i in range(56) for j in
    range(32)
]
off_x = tile.get_width() // 2
off_y = tile.get_height() // 2

tiles.extend(
    [
        (tile, (j * tile.get_width() - off_x, i * tile.get_height() - off_y)) for i in
        range(56) for j in range(32)
    ]
)

trees = [
    (tree, [randint(0, 1000 - s) for s in tree.get_size()]) for _ in range(N_TREES)
]

grass = [
    (grass_img, [randint(0, 1000 - s) for s in grass_img.get_size()]) for _ in
    range(N_GRASS)
]

flowers = [
    (flower_img, [randint(0, 1000 - s) for s in flower_img.get_size()]) for _ in
    range(N_FLOWERS)
]

for collection in [trees, tiles, grass, flowers]:
    collection.sort(key=lambda x: x[1][1])

clock = pygame.Clock()

font = pygame.font.SysFont("Arial", 25, True)

all = tiles
all.extend(grass)
all.extend(flowers)
all.extend(trees)

total_entities = N_TREES + N_GRASS + N_FLOWERS + len(tiles)

while True:
    clock.tick_busy_loop(1000)

    for event in pygame.event.get():
        if event.type == pygame.QUIT:
            pygame.quit()
            quit()

    screen.fblits(all)

    fps_text = font.render(f"FPS: {clock.get_fps():.2f} | Entities: {total_entities}",
                           True, (255, 255, 255), bgcolor=0)
    screen.blit(fps_text, (0, 0))

    pygame.display.flip()

Feb 04 '24 15:02 itzpr3d4t0r

I believe that with pygame having "game" in its name and many softwares made with it being pixel art games, this change is really important and worth implementing it directly here and not going through SDL. Besides, it won't require us changing it or expanding it again since it's a unique case.

I think you don't want to implement it in SDL because you're less familiar with the development process and foibles of contributing to SDL, you think it would be way harder than implementing it here, and you're probably right.

It would be the most significant contribution to SDL by a pygame(-ce) contributor in my memory. I only suggested it because I think you have what it takes to actually do it.

On SDL2 - while it would be nice to upstream stuff to SDL2 I don't see why we can't slip this in to pygame for now and leave upstreaming stuff as future work. There is probably lots of our AVX2 blitting code that could be upstreamed to SDL, not just this.

I don't think this is the case. I think the reason we have blitting code at all is because the algorithm changed between SDL1 and SDL2. We can't upstream what isn't compatible with what they have.

Feb 26 '24 07:02 Starbuck5

I don't get if this is a strong/weak no or a strong/weak yes @Starbuck5.

Mar 28 '24 10:03 itzpr3d4t0r

pygame-ce pygame-ce copied to clipboard

Colorkey blit optimization

pygame-ce
pygame-ce copied to clipboard