---
# Training configuration.
#
# NOTE(review): this file had been collapsed onto a single line, which is not
# parseable YAML. The block structure below was reconstructed from key order
# and key-name grouping; confirm the nesting against the code that consumes
# this file before relying on it. Scalar values are unchanged in meaning
# (booleans lower-cased, floats written with an explicit fractional digit).

image_finetune: false

output_dir: "outputs"
pretrained_model_path: "models/StableDiffusion/stable-diffusion-v1-5"

# NOTE(review): motion_module_type / motion_module_kwargs are assumed to be
# children of unet_additional_kwargs — confirm.
unet_additional_kwargs:
  use_motion_module: true
  motion_module_resolutions: [1, 2, 4, 8]
  unet_use_cross_frame_attention: false
  unet_use_temporal_attention: false

  motion_module_type: "Vanilla"
  motion_module_kwargs:
    num_attention_heads: 8
    num_transformer_block: 1
    attention_block_types: ["Temporal_Self", "Temporal_Self"]
    temporal_position_encoding: true
    temporal_position_encoding_max_len: 32
    temporal_attention_dim_div: 1
    zero_initialize: true

noise_scheduler_kwargs:
  num_train_timesteps: 1000
  beta_start: 0.00085
  beta_end: 0.012
  beta_schedule: "linear"
  steps_offset: 1
  clip_sample: false

# NOTE(review): absolute, machine-specific paths; adjust per environment.
train_data:
  csv_path: "/root/autodl-tmp/zoom_in_24.csv"
  video_folder: "/root/autodl-tmp/data"
  sample_size: 256
  sample_stride: 4
  sample_n_frames: 16

# NOTE(review): num_inference_steps / guidance_scale are assumed to be
# children of validation_data (siblings of prompts) — confirm.
validation_data:
  # Prompt strings are kept byte-for-byte. "Christma" in the second prompt
  # looks like a typo for "Christmas"; left as-is because it is runtime data.
  prompts:
    - "Snow rocky mountains peaks canyon. Snow blanketed rocky mountains surround and shadow deep canyons."
    - "A drone view of celebration with Christma tree and fireworks, starry sky - background."
    - "Robot dancing in times square."
    - "Pacific coast, carmel by the sea ocean and waves."
  num_inference_steps: 25
  guidance_scale: 8.0

trainable_modules:
  - "motion_modules."

unet_checkpoint_path: "/root/autodl-tmp/mm_sd_v15_v2.ckpt"

# Written with an explicit mantissa digit and signed exponent so that both
# YAML 1.1 and YAML 1.2 loaders resolve it as a float rather than a string.
learning_rate: 1.0e-4
train_batch_size: 1

max_train_epoch: 1000
max_train_steps: 50000
checkpointing_epochs: 10
checkpointing_steps: 60

validation_steps: 5000
validation_steps_tuple: [2, 50]

global_seed: 42
mixed_precision_training: true
enable_xformers_memory_efficient_attention: true

is_debug: false