author = "Cardenas, Edwin Jonathan Escobedo",
          affiliation = "{Federal University of Ouro Preto}",
                title = "Multimodal Human Action Recognition Based on a Fusion of Dynamic 
                         Images using CNN descriptors",
            booktitle = "Proceedings...",
                 year = "2018",
               editor = "Ross, Arun and Gastal, Eduardo S. L. and Jorge, Joaquim A. and 
                         Queiroz, Ricardo L. de and Minetto, Rodrigo and Sarkar, Sudeep and 
                         Papa, Jo{\~a}o Paulo and Oliveira, Manuel M. and Arbel{\'a}ez, 
                         Pablo and Mery, Domingo and Oliveira, Maria Cristina Ferreira de 
                         and Spina, Thiago Vallin and Mendes, Caroline Mazetto and Costa, 
                         Henrique S{\'e}rgio Gutierrez and Mejail, Marta Estela and Geus, 
                         Klaus de and Scheer, Sergio",
         organization = "Conference on Graphics, Patterns and Images, 31. (SIBGRAPI)",
            publisher = "IEEE Computer Society",
              address = "Los Alamitos",
             keywords = "action recognition, dynamic images, RGB-D data, kinect, CNN.",
             abstract = "In this paper, we propose the use of a dynamic-images-based approach 
                         for action recognition. Specifically, we exploit the multimodal 
                         information recorded by a Kinect sensor (RGB-D and skeleton joint 
                         data). We combine several ideas from rank pooling and skeleton 
                         optical spectra to generate dynamic images to summarize an action 
                         sequence into single flow images. We group our dynamic images into 
                         five groups: a dynamic color group (DC); a dynamic depth group 
                         (DD) and three dynamic skeleton groups (DXY, DYZ, DXZ). As an action 
                         is composed of different postures along time, we generated N 
                         different dynamic images with the main postures for each dynamic 
                         group. Next, we applied a pre-trained flow-CNN to extract 
                         spatiotemporal features with a max-mean aggregation. The proposed 
                         method was evaluated on a public benchmark dataset, the UTD-MHAD, 
                         and achieved the state-of-the-art result.",
  conference-location = "Foz do Igua{\c{c}}u, PR, Brazil",
      conference-year = "Oct. 29 - Nov. 1, 2018",
             language = "en",
           targetfile = "",
        urlaccessdate = "2020, Dec. 02"