% Journal article: Vaca-Castano, DaVitoria Lobo and Shah, "Holistic object
% detection and image understanding", CVIU vol. 181 (2019).
% Notes for maintainers:
%   - doi stores the bare DOI (no resolver prefix); styles add the link.
%   - pages uses a double hyphen so the range typesets as an en dash.
%   - authors are in "Last, First" form. The second author's surname is taken
%     to be the compound "DaVitoria Lobo" as spelled in the publisher export;
%     TODO confirm the spelling against the paper itself.
%   - abstract is the publisher's text, kept verbatim.
@article{VACACASTANO20191,
  author   = {Vaca-Castano, Gonzalo and DaVitoria Lobo, Niels and Shah, Mubarak},
  title    = {Holistic object detection and image understanding},
  journal  = {Computer Vision and Image Understanding},
  volume   = {181},
  pages    = {1--13},
  year     = {2019},
  issn     = {1077-3142},
  doi      = {10.1016/j.cviu.2019.02.006},
  url      = {https://www.sciencedirect.com/science/article/pii/S1077314219300219},
  keywords = {Computer vision, Image representation, Object detection},
  abstract = {This paper proposes a new representation of the visual content of an image that allows learning about what elements are part of an image and the hierarchical structure that they form. Our representation is a Top-Down Visual-Tree, where every node represents a bounding box, label, and visual feature of an object existing in the image. Each image and its object annotations from a training dataset are parsed to obtain the proposed visual representation. These images and their parsed tree representations are trained using a Top-Down Tree LSTM (Long Short Term Memory) network. The encoded information, allows integrate object detection and image understanding in a single process. The presented holistic object detection is not agnostic to the overall content of the image, and it is influenced by the image composition and the parts discovered. During testing time, from an image, we are able to infer the most prominent type of objects and their locations, the parts of these objects, and having a proper understanding of the image content through the obtained Top-Down Visual-Tree representation output. The accuracy of our object detection process increases notably respect to the baseline Fast R-CNN method in the visual genome test dataset.},
}