Memory leak even after calling workbook.close()
Environment
Python 3.11, python-calamine==0.3.1
Code

def get_table_header(self) -> StructureInfo:
    _, source = self._get_extension_and_source()

    import python_calamine as calamine

    workbook = calamine.load_workbook(source)
    sheets = workbook.sheet_names

    column_info: dict[str, ColumnList] = {}
    for sheet_name in sheets:
        # read the data from the worksheet
        sheet = workbook.get_sheet_by_name(sheet_name)
        data = sheet.iter_rows()
        header = next(data)
        df = pd.DataFrame(data, columns=header)
        # get the column info (header row)
        column_info[sheet_name] = self._get_table_header(df)
        del sheet, df
        break  # only one worksheet is needed, so break out of the loop
    workbook.close()
    del workbook

    return StructureInfo(sheets=sheets, column_info=column_info)
I have tried several ways to keep the memory from growing, e.g. 1) with load_workbook() as workbook: and 2) sheet.to_python(), but the memory still grows.
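For reference, the two workarounds looked roughly like this (a minimal sketch: load_workbook, sheet_names, get_sheet_by_name and to_python are the python-calamine calls used above, while the standalone function and its return value are only illustrative):

import pandas as pd
import python_calamine as calamine

def read_first_sheet_header(source):
    # attempt 1: use the workbook as a context manager instead of close()/del
    with calamine.load_workbook(source) as workbook:
        sheet = workbook.get_sheet_by_name(workbook.sheet_names[0])
        # attempt 2: materialize the rows with to_python() instead of iter_rows()
        rows = sheet.to_python()
        header, body = rows[0], rows[1:]
        df = pd.DataFrame(body, columns=header)
    return list(df.columns)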
memory info:
- ps auxf
# before reading a 20 MB Excel file
root 3752960 78.0 3.5 1705596 718732 pts/0 Sl+ 19:52 0:16 \_ /aipaas/.venv/bin/python /aipaas/.venv/bin/uvicorn --app-dir /aipaas/src aipaas.main:app --host 0.0.0.0
# after reading a 20 MB Excel file
root 3752960 1.7 4.2 1914828 853372 pts/0 Sl+ 19:52 0:22 \_ /aipaas/.venv/bin/python /aipaas/.venv/bin/uvicorn --app-dir /aipaas/src aipaas.main:app --host 0.0.0.0
The memory did not go down even after a long time (about one hour).
- memory_profiler
Line #    Mem usage    Increment  Occurrences   Line Contents
=============================================================
    33    725.4 MiB    725.4 MiB           1   @profile
    34                                         def get_table_header(self) -> StructureInfo:
    35    725.4 MiB      0.0 MiB           1       _, source = self._get_extension_and_source()
    36
    37                                             # with pd.ExcelFile(source, engine="calamine") as workbook:
    38                                             #     sheets = workbook.sheet_names
    39                                             #
    40                                             #     column_info: dict[str, ColumnList] = {}
    41                                             #     for sheet_name in sheets:
    42                                             #         df = workbook.parse(sheet_name=sheet_name, header=None, dtype=str, na_filter=False, **self.excel_args)
    43                                             #         column_info[sheet_name] = self._get_table_header(df)
    44                                             #         del df
    45                                             #         break  # only return the first sheet
    46    725.4 MiB      0.0 MiB           1       import sys
    47    725.6 MiB      0.2 MiB           1       import python_calamine as calamine
    48    793.0 MiB     67.4 MiB           1       workbook = calamine.load_workbook(source)
    49    793.0 MiB      0.0 MiB           1       print(f"workbook size: {sys.getsizeof(workbook)}")
    50    793.0 MiB      0.0 MiB           1       sheets = workbook.sheet_names
    51
    52    793.0 MiB      0.0 MiB           1       column_info: dict[str, ColumnList] = {}
    53    793.0 MiB      0.0 MiB           1       for sheet_name in sheets:
    54                                                 # read the data from the worksheet
    55    796.8 MiB      3.8 MiB           1           sheet = workbook.get_sheet_by_name(sheet_name)
    56    796.8 MiB      0.0 MiB           1           print(f"sheet size: {sys.getsizeof(sheet)}")
    57    796.8 MiB      0.0 MiB           1           data = sheet.iter_rows()
    58    796.8 MiB      0.0 MiB           1           header = next(data)
    59    860.0 MiB     63.2 MiB           1           df = pd.DataFrame(data, columns=header)
    60    860.0 MiB      0.0 MiB           1           print(f"df size: {sys.getsizeof(df)}")
    61                                                 # get the column info (header row)
    62    860.0 MiB      0.0 MiB           1           column_info[sheet_name] = self._get_table_header(df)
    63    853.2 MiB     -6.8 MiB           1           del sheet, df
    64    853.2 MiB      0.0 MiB           1           break  # only one worksheet is needed, so break out of the loop
    65    853.2 MiB      0.0 MiB           1       workbook.close()
    66    853.2 MiB      0.0 MiB           1       del workbook
    67
    68    853.2 MiB      0.0 MiB           1       return StructureInfo(sheets=sheets, column_info=column_info)
P.S. source is an io.BytesIO object.
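To make the setup easier to reproduce, here is a stripped-down standalone version of the same read path, with the workbook loaded from an io.BytesIO as in my service (the file name and helper name are made up for illustration):

import io

import pandas as pd
import python_calamine as calamine

def header_from_bytes(raw: bytes) -> list:
    source = io.BytesIO(raw)  # same kind of in-memory source as above
    workbook = calamine.load_workbook(source)
    sheet = workbook.get_sheet_by_name(workbook.sheet_names[0])
    rows = sheet.iter_rows()
    header = next(rows)
    df = pd.DataFrame(rows, columns=header)
    workbook.close()
    return list(df.columns)

with open("example.xlsx", "rb") as f:  # hypothetical ~20 MB file
    print(header_from_bytes(f.read()))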
I'd like to piggy-back off of this thread. I am also experiencing similar memory leak issues. I have 4 concurrent workers in one pod, and the memory usage trends up until it reaches the 16 GB limit. I'm working with some large Excel files, and even with one worker it will eventually trend up to 16 GB. I wasn't getting this issue with pandas and openpyxl, so I'm wondering if there is an issue with this package. I am on Python 3.9.5 and python-calamine 0.4.0.
@pablo-voltaiq hi. Could you provide example code? Also see https://github.com/dimastbk/python-calamine/issues/42.
@dimastbk Thanks for getting back to me; this is the core of the code I am using. I saw that calamine has to read the entire Excel file into memory, which I am fine with, but the issue arose afterwards: when I was converting multiple files to CSV, the memory allocation would not go down.
import pandas as pd
from python_calamine import CalamineWorkbook


def read_excel_calamine_chunks(sheet, chunk_size=262_144):
    # Yield DataFrames of at most chunk_size rows so the whole sheet is
    # never held in a single DataFrame at once.
    chunk = []
    for row in sheet.iter_rows():
        chunk.append(row)
        if len(chunk) >= chunk_size:
            yield pd.DataFrame(chunk)
            chunk = []
    if chunk:
        yield pd.DataFrame(chunk)


def convert_to_csv():
    file_path = "large_excel_file.xlsx"
    output_path = "output.csv"
    sheet_name = "Sheet1"

    # Stream the sheet into the CSV file chunk by chunk.
    with CalamineWorkbook.from_path(file_path) as workbook, open(
        output_path, "w", encoding="utf-8", newline=""
    ) as fout:
        sheet = workbook.get_sheet_by_name(sheet_name)
        for df in read_excel_calamine_chunks(sheet):
            df.to_csv(fout)


if __name__ == "__main__":
    convert_to_csv()