def infer():
    args = parse_args()
    paddle.init(use_gpu=False, trainer_count=1)
    model = DeepFM(args.factor_size, infer=True)
    # Load the trained parameters from the gzipped tarball.
    parameters = paddle.parameters.Parameters.from_tar(
        gzip.open(args.model_gz_path, 'r'))
    inferer = paddle.inference.Inference(
        output_layer=model, parameters=parameters)
    dataset = reader.Dataset()
    infer_reader = paddle.batch(dataset.infer(args.data_path), batch_size=1000)
    # Run inference batch by batch and write one prediction per line.
    with open(args.prediction_output_path, 'w') as out:
        for id, batch in enumerate(infer_reader()):
            res = inferer.infer(input=batch)
            predictions = [x for x in itertools.chain.from_iterable(res)]
            out.write('\n'.join(map(str, predictions)) + '\n')
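parse_args() is not shown in this snippet. A minimal sketch of the argument parser it is assumed to provide, with flag names taken from the attributes the code reads above (the defaults and help strings are assumptions):

import argparse

def parse_args():
    parser = argparse.ArgumentParser(description="DeepFM inference")
    parser.add_argument('--model_gz_path', type=str, required=True,
                        help='path to the gzipped parameter tarball')
    parser.add_argument('--data_path', type=str, required=True,
                        help='path to the data to run inference on')
    parser.add_argument('--prediction_output_path', type=str, required=True,
                        help='file that receives one prediction per line')
    parser.add_argument('--factor_size', type=int, default=10,
                        help='latent factor size of the factorization machine')
    return parser.parse_args()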
validation_filename = os.path.join(data_dir, "validation.npy")
test_filename = os.path.join(data_dir, "test.npy")
if os.path.exists(train_filename) and os.path.exists(
        validation_filename) and os.path.exists(test_filename):
    train_data = np.load(train_filename)
    validation_data = np.load(validation_filename)
    test_data = np.load(test_filename)
else:
    print(
        "Data does not exist; please check that the directory exists and "
        "run split_dataset.py before training."
    )
    exit(0)

train_images, train_labels = zip(*train_data)
self._train = Dataset(train_images, train_labels)
validation_images, validation_labels = zip(*validation_data)
self._validation = Dataset(validation_images, validation_labels)
test_images, test_labels = zip(*test_data)
self._test = Dataset(test_images, test_labels)
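The Dataset wrapper used above is not part of the snippet. A minimal sketch of what it is assumed to look like, keeping images and labels as parallel arrays:

import numpy as np

class Dataset(object):
    """Holds one split of the data as parallel image/label arrays."""

    def __init__(self, images, labels):
        assert len(images) == len(labels)
        self._images = np.asarray(images)
        self._labels = np.asarray(labels)

    @property
    def images(self):
        return self._images

    @property
    def labels(self):
        return self._labels

    def __len__(self):
        return len(self._images)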
# train DSSM
default_train_paths = ["./data/classification/train/right.txt",
                       "./data/classification/train/wrong.txt"]
default_test_paths = ["./data/classification/test/right.txt",
                      "./data/classification/test/wrong.txt"]
default_dic_path = "./data/vocab.txt"
layer_dims = [int(i) for i in config.config['dnn_dims'].split(',')]

use_default_data = not train_data_paths
if use_default_data:
    train_data_paths = default_train_paths
    test_data_paths = default_test_paths
    source_dic_path = default_dic_path
    target_dic_path = default_dic_path

dataset = reader.Dataset(
    train_paths=train_data_paths,
    test_paths=test_data_paths,
    source_dic_path=source_dic_path,
    target_dic_path=target_dic_path)

train_reader = paddle.batch(
    paddle.reader.shuffle(dataset.train, buf_size=1000),
    batch_size=batch_size)
test_reader = paddle.batch(
    paddle.reader.shuffle(dataset.test, buf_size=1000),
    batch_size=batch_size)

paddle.init(use_gpu=use_gpu, trainer_count=num_workers)

# Build the DSSM network.
cost, prediction, label = DSSM(
    dnn_dims=layer_dims,
    vocab_sizes=[len(load_dic(path))
                 for path in [source_dic_path, target_dic_path]],
if not os.path.isdir(args.model_output_dir):
    os.mkdir(args.model_output_dir)

paddle.init(use_gpu=False, trainer_count=1)
optimizer = paddle.optimizer.Adam(learning_rate=1e-4)

model = DeepFM(args.factor_size)
params = paddle.parameters.create(model)

trainer = paddle.trainer.SGD(cost=model,
                             parameters=params,
                             update_equation=optimizer)

dataset = reader.Dataset()

def __event_handler__(event):
    if isinstance(event, paddle.event.EndIteration):
        num_samples = event.batch_id * args.batch_size
        if event.batch_id % 100 == 0:
            logger.warning("Pass %d, Batch %d, Samples %d, Cost %f, %s" %
                           (event.pass_id, event.batch_id, num_samples,
                            event.cost, event.metrics))
        if event.batch_id % 10000 == 0:
            if args.test_data_path:
                result = trainer.test(
                    reader=paddle.batch(
                        dataset.test(args.test_data_path),
                        batch_size=args.batch_size),
                    feeding=reader.feeding)
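The snippet stops before the training loop is launched. With the legacy paddle.v2 trainer API used above, the launch would typically look like the sketch below; the training reader path and the num_passes flag are assumptions, not part of the original code:

trainer.train(
    reader=paddle.batch(
        paddle.reader.shuffle(
            dataset.train(args.train_data_path), buf_size=10000),
        batch_size=args.batch_size),
    feeding=reader.feeding,
    event_handler=__event_handler__,
    num_passes=args.num_passes)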
def do_eval(args):
    dataset = reader.Dataset(args)
    test_program = fluid.Program()
    with fluid.program_guard(test_program, fluid.default_startup_program()):
        with fluid.unique_name.guard():
            test_ret = creator.create_model(
                args, dataset.vocab_size, dataset.num_labels, mode='test')
    test_program = test_program.clone(for_test=True)

    # init executor
    if args.use_cuda:
        place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
    else:
        place = fluid.CPUPlace()

    pyreader = creator.create_pyreader(args, file_name=args.test_data,
                                       feed_list=test_ret['feed_list'],
default_train_path = "./data/rank/train.txt"
default_test_path = "./data/rank/test.txt"
default_dic_path = "./data/vocab.txt"
if not model_type.is_rank():
    default_train_path = "./data/classification/train.txt"
    default_test_path = "./data/classification/test.txt"

use_default_data = not train_data_path
if use_default_data:
    train_data_path = default_train_path
    test_data_path = default_test_path
    source_dic_path = default_dic_path
    target_dic_path = default_dic_path

dataset = reader.Dataset(
    train_path=train_data_path,
    test_path=test_data_path,
    source_dic_path=source_dic_path,
    target_dic_path=target_dic_path,
    model_type=model_type)

train_reader = paddle.batch(
    paddle.reader.shuffle(dataset.train, buf_size=1000),
    batch_size=batch_size)
test_reader = paddle.batch(
    paddle.reader.shuffle(dataset.test, buf_size=1000),
    batch_size=batch_size)
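As a quick sanity check of the readers built above, one batch can be pulled directly from the shuffled training reader and inspected; this is only an illustrative snippet, not part of the original code:

first_batch = next(train_reader())
print("batch size:", len(first_batch))
print("first sample:", first_batch[0])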
def save_inference_model(args):
    # model definition
    if args.use_cuda:
        place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
    else:
        place = fluid.CPUPlace()

    dataset = reader.Dataset(args)
    infer_program = fluid.Program()
    with fluid.program_guard(infer_program, fluid.default_startup_program()):
        with fluid.unique_name.guard():
            infer_ret = creator.create_model(
                args, dataset.vocab_size, dataset.num_labels, mode='infer')
    infer_program = infer_program.clone(for_test=True)

    # load the pretrained checkpoint
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())
    utils.init_checkpoint(exe, args.init_checkpoint, infer_program)

    fluid.io.save_inference_model(args.inference_save_dir,
                                  ['words'],
def infer(self, data_path):
    dataset = reader.Dataset(
        train_path=data_path,
        test_path=None,
        source_dic_path=args.source_dic_path,
        target_dic_path=args.target_dic_path,
        model_type=args.model_type)
    infer_reader = paddle.batch(dataset.infer, batch_size=1000)
    logger.warning("Write predictions to %s." % args.prediction_output_path)
    output_f = open(args.prediction_output_path, "w")
    for id, batch in enumerate(infer_reader()):
        res = self.inferer.infer(input=batch)
        predictions = [" ".join(map(str, x)) for x in res]
        assert len(batch) == len(predictions), (
            "Error! %d inputs are given, "
            "but only %d predictions are returned.") % (len(batch),
                                                        len(predictions))
def infer(self, data_path):
    logger.info("infer data...")
    dataset = reader.Dataset()
    infer_reader = paddle.batch(
        dataset.infer(args.data_path), batch_size=1000)
    logger.warning('write predictions to %s' % args.prediction_output_path)
    output_f = open(args.prediction_output_path, 'w')
    for id, batch in enumerate(infer_reader()):
        res = self.inferer.infer(input=batch)
        predictions = [x for x in itertools.chain.from_iterable(res)]
        assert len(batch) == len(predictions), (
            "prediction error: %d inputs were given, but %d predictions "
            "were returned" % (len(batch), len(predictions)))
        output_f.write('\n'.join(map(str, predictions)) + '\n')
def main(args):
    # Read (and optionally truncate) the training and validation data.
    train_data = Dataset.from_path(args.train_path)
    if args.max_train_chunks is not None:
        size = args.max_train_chunks * args.chunk_size
        train_data.truncate_seqs(size)
    valid_data = Dataset.from_path(args.valid_path)
    if args.max_valid_chunks is not None:
        size = args.max_valid_chunks * args.chunk_size
        valid_data.truncate_seqs(size, keep_first=True)

    num_users = train_data.num_users
    num_items = train_data.num_items
    tot_size = train_data.num_triplets + valid_data.num_triplets

    train_data.prepare_batches(args.chunk_size, args.batch_size)
    valid_data.prepare_batches(args.chunk_size, args.batch_size,
                               batches_like=train_data)

    settings = {
        "chunk_size": args.chunk_size,
        "batch_size": args.batch_size,
        "hidden_size": args.hidden_size,