pygame-ce
pygame-ce copied to clipboard
Colorkey blit optimization
Most of pygame game types consist of pixel art, which oftentimes utilizes colokey blitting for effective use of pixel assets. This PR optimizes that with AVX2/SSE2 significantly. This strategy is limited to 32bit surfaces that don't use RLE.
The improvements vary widely with surface size so refer to these graphs for a better visualization:
Current VS AVX2
Current VS SSE2
SSE2VS AVX2
My actual timings files: blit_colorkey_AVX2.json blit_colorkey_old.json blit_colorkey_SSE2.json
All made with the following test program:
import pygame.draw
from utils.data_utils import Plotter, Evaluator
from pygame import Surface
def tests_setup(curr_size: int, g: dict):
base = Surface((curr_size, curr_size))
surf = Surface((curr_size, curr_size))
surf.fill(0xFF00FF00)
surf.set_colorkey(0xFF00FF00)
cs2 = curr_size // 2
pygame.draw.circle(surf, 0xFFFF0000, (cs2, cs2), cs2)
g["surf"] = surf
g["base"] = base
tests = [
("blit_colorkey_PR", "base.blit(surf, (0, 0))"),
]
Evaluator(tests, tests_setup, max_size=1000, reps=100, num=1).run()
files = [
("blit_colorkey_old", "white"),
("blit_colorkey_AVX2", "green"),
("blit_colorkey_SSE2", "blue"),
]
p = Plotter("colorkey_blit", files)
p.plot_tests()
# p.compare([(0, 1)])
And this is the data_utils file if you're interested:
import json
from matplotlib import pyplot as plt
import matplotlib.colors as mcolors
from statistics import mean, stdev, median
from timeit import repeat
COLORS = list(mcolors.CSS4_COLORS.keys())
__all__ = ["Plotter", "Evaluator"]
JSON_DIR = "files_json"
class Plotter:
def __init__(self, title: str, tests: list, mode: str = "MIN",
limit_to_range: int = -1):
plt.style.use(["dark_background"])
self.title = title
self.tests = tests
self.mode = mode
self.mode_func = None
self.limit_to_range = limit_to_range
self.filter(mode)
def filter(self, mode: str):
self.mode = mode
match mode:
case "MEAN":
self.mode_func = mean
case "MIN":
self.mode_func = min
case "MAX":
self.mode_func = max
case "MEDIAN":
self.mode_func = median
def plot_tests(self):
for file_name, color in self.tests:
try:
with open(f"{JSON_DIR}/{file_name}.json", "r") as f:
data = json.load(f)
except FileNotFoundError:
print(f"File {file_name}.json not found!")
quit()
timings = [self.mode_func(dp) for dp in data["data"]][:self.limit_to_range]
print(f"=== {file_name} ===")
print(
f"Total: {sum([sum(data_point) for data_point in data['data']])}\n"
f"Mean: {mean(timings)}\n"
f"Median: {median(timings)}\n"
f"Stdev: {stdev(timings)}"
)
print()
plt.plot(timings, color=color, label=file_name, linewidth=1)
plt.legend()
plt.title(self.title)
plt.xlabel("Surface size (px)")
plt.ylabel("Time (s)")
plt.show()
def compare(self, indices: list[tuple[int, int]], c1="white", c2="lime"):
for i1, i2 in indices:
filename_1, _ = self.tests[i1]
filename_2, _ = self.tests[i2]
try:
with open(f"{JSON_DIR}/{filename_1}.json", "r") as f:
data_1 = json.load(f)
except FileNotFoundError:
print(f"File {filename_1}.json not found!")
quit()
try:
with open(f"{JSON_DIR}/{filename_2}.json", "r") as f:
data_2 = json.load(f)
except FileNotFoundError:
print(f"File {filename_2}.json not found!")
quit()
timings_1 = [self.mode_func(dp) for dp in data_1["data"]][
:self.limit_to_range]
timings_2 = [self.mode_func(dp) for dp in data_2["data"]][
:self.limit_to_range]
plt.figure(figsize=(10, 5))
curr_index = 1
plt.subplot(len(indices), 2, curr_index)
plt.scatter(range(len(timings_1)), timings_1, color=c1, label=filename_1,
s=.5)
plt.scatter(range(len(timings_1)), timings_2, color=c2, label=filename_2,
s=.5)
plt.legend()
plt.title("Timings")
plt.xlabel("Surface size (px)")
plt.ylabel("Time (s)")
plt.subplot(len(indices), 2, curr_index + 1)
comparative_data = [
100 * ((t1 / t2) - 1) for t1, t2 in zip(timings_1, timings_2)
]
plt.scatter(range(len(comparative_data)), comparative_data, color="red", s=1)
plt.plot([0] * len(comparative_data), color="green", linewidth=2)
plt.title("Relative % improvement")
plt.xlabel("Surface size (px)")
plt.show()
class Evaluator:
def __init__(self, tests: list, tests_setup, pre_test_setup=None,
max_size: int = 1000, reps: int = 30,
num: int = 1):
self.tests = tests
self.tests_setup = tests_setup
self.max_size = max_size
self.reps = reps
self.num = num
self.G = {}
if pre_test_setup:
try:
pre_test_setup(self.G)
except TypeError:
raise TypeError("pre_test_setup must be a callable")
def run(self):
for test_name, statement in self.tests:
data = {"title": test_name, "data": []}
print(f"\n========| {test_name.upper()} |========")
for curr_size in range(1, self.max_size + 1):
self.print_progress_bar(curr_size)
self.tests_setup(curr_size, self.G)
data["data"].append(self.run_test(statement))
with open(f"{JSON_DIR}/{test_name}.json", "w") as f:
json.dump(data, f)
def print_progress_bar(self, curr_size: int):
amt = 3 * curr_size // 100
print(
"\r[" + "▪" * amt + " " * (3 * 10 - amt) + f"] {curr_size} | {self.max_size}", end=""
)
def run_test(self, statement):
return repeat(statement, globals=self.G, number=self.num, repeat=self.reps)
When I read the title, I assumed this was a SIMD implementation of alphablit_colorkey. As far as I can understand, that's not it at all.
What you've made is not for alpha blitting all, it's another special cased blit routine to take over from SDL.
Therefore I don't think should be in pygameBlit, because everything in pygameBlit is for alpha or blend flags? If you can determine what code path it's meant for in surface.blit, why not send it there directly?
Secondly, could this be upstreamed to SDL? That way we don't need a special path at all, and everyone's code gets faster.
When I read the title, I assumed this was a SIMD implementation of
alphablit_colorkey. As far as I can understand, that's not it at all. What you've made is not for alpha blitting all, it's another special cased blit routine to take over from SDL.
Yes, that's correct. It's an AVX/SSE implementation of the staight colorkey blit.
Therefore I don't think should be in pygameBlit, because everything in pygameBlit is for alpha or blend flags? If you can determine what code path it's meant for in surface.blit, why not send it there directly?
I included it here to prevent redundant code. Creating a separate function would entail duplicating the rect clipping logic and other checks, which seemed impractical.
Secondly, could this be upstreamed to SDL? That way we don't need a special path at all, and everyone's code gets faster.
It could but how long would we have to wait for it? I've seen people suggest avx/sse improvements to old code and in all got flagged for 3.2.0 at best so I'm afraid we would never see these improvements soon. And generally speaking SDL doesn't look to implement much past SSE code from what I've seen. So the wait could be very long for something that would impact tons of pygame games immediately.
And a note, these improvements, even in the sse case, look to be even better than our best AVX alpha/premultiplied alpha implementations.
I believe that with pygame having "game" in its name and many softwares made with it being pixel art games, this change is really important and worth implementing it directly here and not going through SDL. Besides, it won't require us changing it or expanding it again since it's a unique case.
A little test program for reproducibility (without this PR i get around 200/250 fps and with this PR 470/520):
Images:
import pygame
from random import randint
pygame.init()
screen = pygame.display.set_mode((1000, 1000))
N_TREES = 150
N_GRASS = 4000
N_FLOWERS = 1000
tree = pygame.image.load("tree.png").convert()
tile = pygame.image.load("grass_tile.png").convert()
grass_img = pygame.image.load("grass.png").convert()
flower_img = pygame.image.load("flower.png").convert()
for surf in [tree, tile, grass_img, flower_img]:
surf.set_colorkey((255, 0, 220))
tiles = [
(tile, (j * tile.get_width(), i * tile.get_height())) for i in range(56) for j in
range(32)
]
off_x = tile.get_width() // 2
off_y = tile.get_height() // 2
tiles.extend(
[
(tile, (j * tile.get_width() - off_x, i * tile.get_height() - off_y)) for i in
range(56) for j in range(32)
]
)
trees = [
(tree, [randint(0, 1000 - s) for s in tree.get_size()]) for _ in range(N_TREES)
]
grass = [
(grass_img, [randint(0, 1000 - s) for s in grass_img.get_size()]) for _ in
range(N_GRASS)
]
flowers = [
(flower_img, [randint(0, 1000 - s) for s in flower_img.get_size()]) for _ in
range(N_FLOWERS)
]
for collection in [trees, tiles, grass, flowers]:
collection.sort(key=lambda x: x[1][1])
clock = pygame.Clock()
font = pygame.font.SysFont("Arial", 25, True)
all = tiles
all.extend(grass)
all.extend(flowers)
all.extend(trees)
total_entities = N_TREES + N_GRASS + N_FLOWERS + len(tiles)
while True:
clock.tick_busy_loop(1000)
for event in pygame.event.get():
if event.type == pygame.QUIT:
pygame.quit()
quit()
screen.fblits(all)
fps_text = font.render(f"FPS: {clock.get_fps():.2f} | Entities: {total_entities}",
True, (255, 255, 255), bgcolor=0)
screen.blit(fps_text, (0, 0))
pygame.display.flip()
I believe that with pygame having "game" in its name and many softwares made with it being pixel art games, this change is really important and worth implementing it directly here and not going through SDL. Besides, it won't require us changing it or expanding it again since it's a unique case.
I think you don't want to implement it in SDL because you're less familiar with the development process and foibles of contributing to SDL, you think it would be way harder than implementing it here, and you're probably right.
It would be the most significant contribution to SDL by a pygame(-ce) contributor in my memory. I only suggested it because I think you have what it takes to actually do it.
On SDL2 - while it would be nice to upstream stuff to SDL2 I don't see why we can't slip this in to pygame for now and leave upstreaming stuff as future work. There is probably lots of our AVX2 blitting code that could be upstreamed to SDL, not just this.
I don't think this is the case. I think the reason we have blitting code at all is because the algorithm changed between SDL1 and SDL2. We can't upstream what isn't compatible with what they have.
I don't get if this is a strong/weak no or a strong/weak yes @Starbuck5.