deepspeed FastChat/fastchat/train/train_lora.py \
    --model_name_or_path /path/to/llama-2-13b-chat/ \
    --lora_r 64 \
    --lora_alpha 16 \
    --lora_dropout 0.1 \
    --data_path ./data/fine_tuning_data_dila_v4.json \
    --output_dir ./data/models/dtnum_13b_dila_v4 \
    --num_train_epochs 10 \
    --fp16 True \
    --per_device_train_batch_size 3 \
    --per_device_eval_batch_size 3 \
    --gradient_accumulation_steps 4 \
    --evaluation_strategy "no" \
    --save_strategy "steps" \
    --save_steps 240 \
    --save_total_limit 10 \
    --learning_rate 2e-4 \
    --weight_decay 0.001 \
    --warmup_ratio 0.03 \
    --lr_scheduler_type "cosine" \
    --logging_strategy "steps" \
    --logging_steps 1 \
    --tf32 True \
    --model_max_length 4096 \
    --q_lora False \
    --deepspeed ./FastChat/playground/deepspeed_config_s2.json \
    --gradient_checkpointing True \
    --flash_attn False
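With --per_device_train_batch_size 3 and --gradient_accumulation_steps 4, the effective batch size is 12 examples per GPU per optimizer step (times the number of GPUs deepspeed launches on). The --deepspeed flag points at a ZeRO stage 2 config file; the sketch below is a minimal example of what such a file can look like, not the exact deepspeed_config_s2.json shipped with FastChat, which may differ. The "auto" values are placeholders that the Hugging Face Trainer integration resolves from the command-line arguments above.

{
  "zero_optimization": {
    "stage": 2,
    "offload_optimizer": {
      "device": "cpu"
    },
    "contiguous_gradients": true,
    "overlap_comm": true
  },
  "fp16": {
    "enabled": "auto"
  },
  "train_batch_size": "auto",
  "train_micro_batch_size_per_gpu": "auto",
  "gradient_accumulation_steps": "auto",
  "gradient_clipping": "auto"
}

Stage 2 shards optimizer states and gradients across GPUs while keeping a full copy of the model parameters on each device, which is usually sufficient for LoRA fine-tuning of a 13B model since only the adapter weights are trained.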