Recognizer3D(
  (backbone): TimeSformer(
    (patch_embed): PatchEmbed(
      (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    )
    (drop_after_pos): Dropout(p=0.0, inplace=False)
    (drop_after_time): Dropout(p=0.0, inplace=False)
    (norm): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
    (transformer_layers): TransformerLayerSequence(
      (layers): ModuleList(
        (0-11): 12 x BaseTransformerLayer(
          (attentions): ModuleList(
            (0): DividedTemporalAttentionWithNorm(
              (norm): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
              (attn): MultiheadAttention(
                (out_proj): _LinearWithBias(in_features=768, out_features=768, bias=True)
              )
              (proj_drop): Dropout(p=0.0, inplace=False)
              (dropout_layer): DropPath()
              (temporal_fc): Linear(in_features=768, out_features=768, bias=True)
            )
            (1): DividedSpatialAttentionWithNorm(
              (norm): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
              (attn): MultiheadAttention(
                (out_proj): _LinearWithBias(in_features=768, out_features=768, bias=True)
              )
              (proj_drop): Dropout(p=0.0, inplace=False)
              (dropout_layer): DropPath()
            )
          )
          (ffns): ModuleList(
            (0): FFNWithNorm(
              (activate): GELU()
              (layers): Sequential(
                (0): Sequential(
                  (0): Linear(in_features=768, out_features=3072, bias=True)
                  (1): GELU()
                  (2): Dropout(p=0.0, inplace=False)
                )
                (1): Linear(in_features=3072, out_features=768, bias=True)
                (2): Dropout(p=0.0, inplace=False)
              )
              (dropout_layer): DropPath()
              (norm): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
            )
          )
          (norms): ModuleList()
        )
      )
    )
  )
  (cls_head): TimeSformerHead(
    (loss_cls): CrossEntropyLoss()
    (fc_cls): Linear(in_features=768, out_features=400, bias=True)
  )
)
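
For orientation, here is a minimal plain-PyTorch sketch of what the tree above computes: a 16x16 Conv2d patch embedding, twelve divided space-time attention blocks (temporal attention, then spatial attention, then a 768 -> 3072 -> 768 FFN, each pre-normed with LayerNorm(eps=1e-6) and residually connected), and a 400-way linear head (presumably Kinetics-400). This is not the MMAction2 implementation: the class names, the head count (12), the clip shape (8 frames at 224x224), and the mean-pooled readout are assumptions for illustration, and the class token, the positional/temporal embeddings (which drop_after_pos/drop_after_time act on), and the p=0.0 Dropout/DropPath modules are omitted.

import torch
import torch.nn as nn

class DividedBlock(nn.Module):
    """One BaseTransformerLayer from the tree above (illustrative name):
    temporal attention, then spatial attention, then an FFN."""
    def __init__(self, dim=768, heads=12, mlp_dim=3072):  # heads=12 is assumed
        super().__init__()
        self.norm_t = nn.LayerNorm(dim, eps=1e-6)
        self.attn_t = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.temporal_fc = nn.Linear(dim, dim)  # the (temporal_fc) module above
        self.norm_s = nn.LayerNorm(dim, eps=1e-6)
        self.attn_s = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.norm_f = nn.LayerNorm(dim, eps=1e-6)
        self.ffn = nn.Sequential(nn.Linear(dim, mlp_dim), nn.GELU(),
                                 nn.Linear(mlp_dim, dim))

    def forward(self, x, t, p):
        b, _, d = x.shape  # x: (batch, t*p tokens, dim); class token omitted
        # Temporal attention: length-t sequences, one per spatial location.
        h = self.norm_t(x).view(b, t, p, d).transpose(1, 2).reshape(b * p, t, d)
        h, _ = self.attn_t(h, h, h, need_weights=False)
        h = h.reshape(b, p, t, d).transpose(1, 2).reshape(b, t * p, d)
        x = x + self.temporal_fc(h)
        # Spatial attention: length-p sequences, one per frame.
        h = self.norm_s(x).reshape(b * t, p, d)
        h, _ = self.attn_s(h, h, h, need_weights=False)
        x = x + h.reshape(b, t * p, d)
        # FFN: 768 -> 3072 -> 768, as in FFNWithNorm above.
        return x + self.ffn(self.norm_f(x))

class TimeSformerSketch(nn.Module):
    def __init__(self, size=224, patch=16, dim=768, depth=12, num_classes=400):
        super().__init__()
        # (patch_embed): 16x16 non-overlapping patches -> 768-d tokens.
        self.patch_embed = nn.Conv2d(3, dim, kernel_size=patch, stride=patch)
        self.blocks = nn.ModuleList(DividedBlock(dim) for _ in range(depth))
        self.norm = nn.LayerNorm(dim, eps=1e-6)
        self.fc_cls = nn.Linear(dim, num_classes)  # the (cls_head) above
        self.p = (size // patch) ** 2  # 196 patches per 224x224 frame

    def forward(self, video):
        # video: (batch, 3, frames, H, W) -> fold frames into the batch dim.
        b, c, t, h, w = video.shape
        x = self.patch_embed(video.transpose(1, 2).reshape(b * t, c, h, w))
        x = x.flatten(2).transpose(1, 2).reshape(b, t * self.p, -1)
        for blk in self.blocks:
            x = blk(x, t, self.p)
        # Mean-pool tokens for the readout (the real model uses a class token).
        return self.fc_cls(self.norm(x).mean(dim=1))

logits = TimeSformerSketch()(torch.randn(1, 3, 8, 224, 224))
print(logits.shape)  # torch.Size([1, 400])

Note that print(TimeSformerSketch()) produces a module tree of the same shape as the dump above; a repr like this records module structure and layer hyperparameters only, not the forward-pass wiring between them.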