author = "Duarte, Leonardo Assuane and Penatti, Ot{\'a}vio Augusto Bizetto 
                         and Almeida, Jurandy",
          affiliation = "{Universidade Federal de S{\~a}o Paulo - UNIFESP} and {SAMSUNG 
                         Research Institute} and {Universidade Federal de S{\~a}o Paulo - 
                         UNIFESP}",
                title = "Bag of attributes for video event retrieval",
            booktitle = "Proceedings...",
                 year = "2018",
               editor = "Ross, Arun and Gastal, Eduardo S. L. and Jorge, Joaquim A. and 
                         Queiroz, Ricardo L. de and Minetto, Rodrigo and Sarkar, Sudeep and 
                         Papa, Jo{\~a}o Paulo and Oliveira, Manuel M. and Arbel{\'a}ez, 
                         Pablo and Mery, Domingo and Oliveira, Maria Cristina Ferreira de 
                         and Spina, Thiago Vallin and Mendes, Caroline Mazetto and Costa, 
                         Henrique S{\'e}rgio Gutierrez and Mejail, Marta Estela and Geus, 
                         Klaus de and Scheer, Sergio",
         organization = "Conference on Graphics, Patterns and Images, 31. (SIBGRAPI)",
            publisher = "IEEE Computer Society",
              address = "Los Alamitos",
             keywords = "video event retrieval, video representation, visual dictionaries",
             abstract = "In this paper, we present the Bag-of-Attributes (BoA) model for 
                         video representation aiming at video event retrieval. The BoA 
                         model is based on a semantic feature space for representing 
                         videos, resulting in high-level video feature vectors. For 
                         creating a semantic space, i.e., the attribute space, we can train 
                         a classifier using a labeled image dataset, obtaining a 
                         classification model that can be understood as a high-level 
                         codebook. This model is used to map low-level frame vectors into 
                         high-level vectors (e.g., classifier probability scores). Then, we 
                         apply pooling operations to the frame vectors to create the final 
                         bag of attributes for the video. In the BoA representation, each 
                         dimension corresponds to one category (or attribute) of the 
                         semantic space. Other interesting properties are: compactness, 
                         flexibility regarding the classifier, and ability to encode 
                         multiple semantic concepts in a single video representation. Our 
                         experiments considered the semantic space created by 
                         state-of-the-art convolutional neural networks pre-trained on 1000 
                         object categories of ImageNet. Such deep neural networks were used 
                         to classify each video frame and then different coding strategies 
                         were used to encode the probability distribution from the softmax 
                         layer into a frame vector. Next, different pooling strategies were 
                         used to combine frame vectors in the BoA representation for a 
                         video. Results using BoA were comparable or superior to the 
                         baselines in the task of video event retrieval using the EVVE 
                         dataset, with the advantage of providing a much more compact 
                         representation.",
  conference-location = "Foz do Igua{\c{c}}u, PR, Brazil",
      conference-year = "Oct. 29 - Nov. 1, 2018",
             language = "en",
           targetfile = "59paper.pdf",
        urlaccessdate = "2020, Dec. 04"