Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_groupby_large_random_integers(seed):
    """Verify `nunique1()` on a frame of seeded pseudo-random integers.

    Every value is assembled from several pieces: piece ``i`` is picked at
    random from ``chunks[i]`` and shifted left by ``8 * i`` bits. The distinct
    count reported by the frame must equal ``len(set(...))`` of the raw
    values, both for the frame itself and for the frame row-bound to itself
    a random number of times.
    """
    random.seed(seed)
    extra_groups = random.choice([1, 1, 2, 2, 2, 3, 4, 5])
    span = 1 << random.choice([1, 1, 2, 2, 2, 3, 3, 3, 4, 5, 6, 7])

    # Lowest chunk: a random-sized sample drawn from range(span).
    lead_size = random.randint(1, span)
    chunks = [random.sample(range(span), lead_size)]
    # Further chunks: 1..20 draws from a pool in which zero is heavily
    # over-represented.
    pool = [0] * 100 + list(range(256))
    for _ in range(extra_groups):
        pick_count = random.randint(1, 20)
        chunks.append(random.sample(pool, pick_count))

    row_count = int(random.expovariate(0.0001)) + 10
    values = []
    for _ in range(row_count):
        total = 0
        for pos, chunk in enumerate(chunks):
            total += random.choice(chunk) << (8 * pos)
        values.append(total)

    expected = len(set(values))
    frame = dt.Frame(values)
    assert frame.nunique1() == expected

    # Repeating the same rows must not change the number of distinct values.
    repeats = random.randint(2, 20)
    stacked = dt.rbind(*([frame] * repeats))
    assert stacked.nunique1() == expected
def test_groupby_with_filter1():
    """Keep rows with X > 0, then sum X for each KEY value.

    KEY 1 keeps 3 and 1 (total 4); KEY 2 keeps only 2 (total 2).
    """
    source = dt.Frame(KEY=[1, 2, 1, 2, 1, 2], X=[-10, 2, 3, 0, 1, -7])
    positive_rows = f.X > 0
    result = source[positive_rows, sum(f.X), f.KEY]
    expected = [[1, 2], [4, 2]]
    assert result.to_list() == expected
def test_sum_empty_frame():
    """Summing every column of a 0-row frame must give one row of zeros."""
    column_names = list("ABCD")
    column_stypes = (dt.bool8, dt.int32, dt.float32, dt.float64)
    empty = dt.Frame([[]] * 4, names=column_names, stypes=column_stypes)
    assert empty.shape == (0, 4)

    totals = empty[:, sum(f[:])]
    frame_integrity_check(totals)
    assert totals.shape == (1, 4)
    assert totals.names == ("A", "B", "C", "D")
    # The bool8 and int32 inputs are expected to come out as int64, while the
    # two float columns keep their original stypes.
    assert totals.stypes == (dt.int64, dt.int64, dt.float32, dt.float64)
    assert totals.to_list() == [[0], [0], [0], [0]]
    # The string rendering of the result must be non-empty.
    assert str(totals)
def test_group_empty_frame4():
    """Grouped sum over an empty float32 column gives a 0x2 float32 frame."""
    empty = dt.Frame(A=[], stype=dt.float32)
    grouped = empty[:, sum(f.A), by(f.A)]
    frame_integrity_check(grouped)

    expected_shape = (0, 2)
    expected_stypes = (dt.float32, dt.float32)
    assert grouped.shape == expected_shape
    assert grouped.stypes == expected_stypes
# Tail of a benchmark step: `ans` and `t_start` are set above this fragment.
# Checksum query: total of column v1 over the answer frame.
# NOTE(review): `sum` is presumably datatable's reducer rather than the
# builtin - confirm against the file's imports.
chk = ans[:, sum(f.v1)]
# Time elapsed since `t_start`, reported below as chk_time_sec.
chkt = timeit.default_timer() - t_start
# Record this measurement as run=2, with the flattened checksum values.
write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.to_list())), chk_time_sec=chkt, on_disk=on_disk)
# Show the first and last three rows of the answer, then drop the reference.
print(ans.head(3), flush=True)
print(ans.tail(3), flush=True)
del ans
# q3, first run: per-id3 sum of v1 and mean of v3.
question = "sum v1 mean v3 by id3" # q3
# Collect garbage before the timer starts.
gc.collect()
t_start = timeit.default_timer()
ans = x[:, {"v1": sum(f.v1), "v3": mean(f.v3)}, by(f.id3)]
# The shape is printed before the timer is read, so this print is included
# in the measured time `t`.
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
# Restart the timer so the checksum query is measured on its own.
t_start = timeit.default_timer()
# Checksum: totals of both result columns.
chk = ans[:, [sum(f.v1), sum(f.v3)]]
chkt = timeit.default_timer() - t_start
# Record this measurement as run=1.
write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.to_list())), chk_time_sec=chkt, on_disk=on_disk)
# Drop the answer before the query is repeated.
del ans
# Second run of the "v1 sum / v3 mean by id3" query; `question` is set
# earlier in the script.
# Collect garbage before the timer starts.
gc.collect()
t_start = timeit.default_timer()
ans = x[:, {"v1": sum(f.v1), "v3": mean(f.v3)}, by(f.id3)]
# The shape is printed before the timer is read, so this print is included
# in the measured time `t`.
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
# Restart the timer so the checksum query is measured on its own.
t_start = timeit.default_timer()
# Checksum: totals of both result columns.
chk = ans[:, [sum(f.v1), sum(f.v3)]]
chkt = timeit.default_timer() - t_start
# Record this measurement as run=2.
write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.to_list())), chk_time_sec=chkt, on_disk=on_disk)
# Show the first and last three rows of the answer, then drop the reference.
print(ans.head(3), flush=True)
print(ans.tail(3), flush=True)
del ans
def transform(self, X: dt.Frame):
    """Select from X the `dt.sum` of per-column ``< 0`` comparisons.

    One expression ``dt.f[i] < 0`` is built for each column index of X, and
    the list of them is passed to ``dt.sum`` inside an ``X[:, ...]``
    selection.
    NOTE(review): whether ``dt.sum`` on a list of expressions totals them
    row-wise or reduces each column depends on the datatable version -
    confirm against the installed release.
    """
    negative_flags = [(dt.f[col] < 0) for col in range(X.ncols)]
    return X[:, dt.sum(negative_flags)]
"sum": lambda expr: datatable.sum(expr_to_dt_expr(expr.args[0])),
"+": lambda expr: expr_to_dt_expr(expr.args[0]) + expr_to_dt_expr(expr.args[1]),
def transform(self, X: dt.Frame):
    """Select from X the `dt.sum` of all of its column expressions.

    One ``dt.f[i]`` expression is built for each column index of X, and the
    list of them is passed to ``dt.sum`` inside an ``X[:, ...]`` selection.
    NOTE(review): whether ``dt.sum`` on a list of expressions totals them
    row-wise or reduces each column depends on the datatable version -
    confirm against the installed release.
    """
    column_exprs = [dt.f[col] for col in range(X.ncols)]
    return X[:, dt.sum(column_exprs)]
# Tail of a first benchmark run: `ans`, `t_start` and `question` are set
# above this fragment.
# The shape is printed before the timer is read, so this print is included
# in the measured time `t`.
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
# Restart the timer so the checksum query is measured on its own.
t_start = timeit.default_timer()
# Checksum: total of the range_v1_v2 column of the answer.
chk = ans[:, sum(f.range_v1_v2)]
chkt = timeit.default_timer() - t_start
# Record this measurement as run=1.
write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.to_list())), chk_time_sec=chkt, on_disk=on_disk)
# Drop the answer before the query is repeated.
del ans
# Second run of the range query: per-id3 value of max(v1) - min(v2);
# `question` is set earlier in the script.
# Collect garbage before the timer starts.
gc.collect()
t_start = timeit.default_timer()
ans = x[:, {"range_v1_v2": max(f.v1)-min(f.v2)}, by(f.id3)]
# The shape is printed before the timer is read, so this print is included
# in the measured time `t`.
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
# Restart the timer so the checksum query is measured on its own.
t_start = timeit.default_timer()
# Checksum: total of the range_v1_v2 column of the answer.
chk = ans[:, sum(f.range_v1_v2)]
chkt = timeit.default_timer() - t_start
# Record this measurement as run=2.
write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.to_list())), chk_time_sec=chkt, on_disk=on_disk)
# Show the first and last three rows of the answer, then drop the reference.
print(ans.head(3), flush=True)
print(ans.tail(3), flush=True)
del ans
# q8, first run: the two largest v3 values within each id6 group.
question = "largest two v3 by id6" # q8
# Collect garbage before the timer starts.
gc.collect()
t_start = timeit.default_timer()
# Row selector `:2` combined with by(f.id6) and sort(-f.v3): presumably the
# first two rows of every id6 group after ordering by descending v3 - confirm
# against datatable's by/sort semantics.
ans = x[:2, {"largest2_v3": f.v3}, by(f.id6), sort(-f.v3)]
# The shape is printed before the timer is read, so this print is included
# in the measured time `t`.
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
# Restart the timer so the checksum query is measured on its own.
t_start = timeit.default_timer()
# Checksum: total of the largest2_v3 column of the answer.
chk = ans[:, sum(f.largest2_v3)]
chkt = timeit.default_timer() - t_start