I am trying to use the Google cloud to run a script that makes predictions for every line of a test.csv
file. I use the cloud because it looks like Google Colab is going to take some time. However, when I run it there is a memory error:
(pre_env) mikempc3@instance-1:~$ python predictSales.py
Traceback (most recent call last):
File "predictSales.py", line 7, in <module>
sales = pd.read_csv("sales_train.csv")
File "/home/mikempc3/pre_env/lib/python3.5/site-packages/pandas/io/parsers.py", line 685, in parser_f
return _read(filepath_or_buffer, kwds)
File "/home/mikempc3/pre_env/lib/python3.5/site-packages/pandas/io/parsers.py", line 463, in _read
data = parser.read(nrows)
File "/home/mikempc3/pre_env/lib/python3.5/site-packages/pandas/io/parsers.py", line 1169, in read
df = DataFrame(col_dict, columns=columns, index=index)
File "/home/mikempc3/pre_env/lib/python3.5/site-packages/pandas/core/frame.py", line 411, in __init__
mgr = init_dict(data, index, columns, dtype=dtype)
File "/home/mikempc3/pre_env/lib/python3.5/site-packages/pandas/core/internals/construction.py", line 257, in init_dict
return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
File "/home/mikempc3/pre_env/lib/python3.5/site-packages/pandas/core/internals/construction.py", line 87, in arrays_to_mgr
return create_block_manager_from_arrays(arrays, arr_names, axes)
File "/home/mikempc3/pre_env/lib/python3.5/site-packages/pandas/core/internals/managers.py", line 1694, in create_block_manager_from_arrays
blocks = form_blocks(arrays, names, axes)
File "/home/mikempc3/pre_env/lib/python3.5/site-packages/pandas/core/internals/managers.py", line 1764, in form_blocks
int_blocks = _multi_blockify(items_dict["IntBlock"])
File "/home/mikempc3/pre_env/lib/python3.5/site-packages/pandas/core/internals/managers.py", line 1846, in _multi_blockify
values, placement = _stack_arrays(list(tup_block), dtype)
File "/home/mikempc3/pre_env/lib/python3.5/site-packages/pandas/core/internals/managers.py", line 1874, in _stack_arrays
stacked = np.empty(shape, dtype=dtype)
MemoryError: Unable to allocate 67.2 MiB for an array with shape (3, 2935849) and data type int64
Here is my script:
import statsmodels.tsa.arima.model as smt
import pandas as pd
import datetime
import numpy as np
sales = pd.read_csv("sales_train.csv")
test = pd.read_csv("test.csv")
sales.date = sales.date.apply(lambda x: datetime.datetime.strptime(x, "%d.%m.%Y"))
sales_monthly = sales.groupby(
["date_block_num", "shop_id", "item_id"])["date", "item_price",
"item_cnt_day"].agg({
"date": ["min", "max"],
"item_price": "mean",
"item_cnt_day": "sum"})
array = []
for i, row in test.iterrows():
print("row['shop_id']: ", row['shop_id'], " row['item_id']: ", row['item_id'])
print(statsmodels.__version__)
ts = pd.DataFrame(sales_monthly.loc[pd.IndexSlice[:, [row['shop_id']], [row['item_id']]], :]['item_price'].values *
sales_monthly.loc[pd.IndexSlice[:, [row['shop_id']], [row['item_id']]], :][
'item_cnt_day'].values).T.iloc[0]
print(ts.values)
if ts.values != [] and len(ts.values) > 2:
best_aic = np.inf
best_order = None
best_model = None
ranges = range(1, 5)
for difference in ranges:
# try:
tmp_model = smt.ARIMA(ts.values, order=(0, 1, 0), trend='t').fit()
tmp_aic = tmp_model.aic
if tmp_aic < best_aic:
best_aic = tmp_aic
best_difference = difference
best_model = tmp_model
# except Exception as e:
# print(e)
# continue
if best_model is not None:
y_hat = best_model.forecast()[0]
if y_hat < 0:
y_hat = 0
else:
y_hat = 0
else:
y_hat = 0
print("predicted:", y_hat)
d = {'id': row['ID'], 'item_cnt_month': y_hat}
array.append(d)
print("-------------------")
df = pd.DataFrame(array)
df.to_csv("submission.csv")
question from:
https://stackoverflow.com/questions/65937287/memoryerror-when-running-python-script-on-google-cloud