Abstract: We construct multi-modal concept representations by concatenating a skip-gram linguistic representation vector with a visual concept representation vector computed using the feature extraction layers of a deep convolutional neural network (CNN) trained on a large labeled object recognition dataset. This transfer learning approach brings a clear performance gain over features based on the traditional bag-of-visual-words approach. Experimental results are reported on the WordSim353 and MEN semantic relatedness evaluation tasks, using visual features computed from either ImageNet or ESP Game images.
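
The concatenation step described in the abstract can be sketched roughly as follows. This is not the authors' code: the lookup tables `linguistic` and `visual`, the vector dimensions, and the mixing parameter `alpha` are illustrative assumptions, with random stand-ins in place of real skip-gram vectors and CNN feature-layer activations aggregated over a concept's images.

# Minimal sketch (assumed, not the paper's implementation) of building a
# multi-modal concept vector by concatenating a linguistic vector with a
# visual vector, then scoring relatedness by cosine similarity.
import numpy as np

rng = np.random.default_rng(0)
words = ["dog", "cat", "car"]

# Hypothetical stand-ins for real embeddings:
linguistic = {w: rng.normal(size=300) for w in words}   # e.g. skip-gram vectors
visual = {w: rng.normal(size=4096) for w in words}      # e.g. CNN feature-layer activations

def l2_normalize(v):
    n = np.linalg.norm(v)
    return v / n if n > 0 else v

def multimodal(word, alpha=0.5):
    # Concatenate normalized linguistic and visual vectors,
    # weighted by an (assumed) mixing parameter alpha.
    ling = alpha * l2_normalize(linguistic[word])
    vis = (1.0 - alpha) * l2_normalize(visual[word])
    return np.concatenate([ling, vis])

def cosine(u, v):
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))

# Relatedness between two concepts is then the cosine similarity of their
# multi-modal vectors, as evaluated on benchmarks such as WordSim353 and MEN.
print(cosine(multimodal("dog"), multimodal("cat")))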
@inproceedings{kiela-bottou-2014,
  author = {Kiela, Douwe and Bottou, L\'{e}on},
  title = {Learning Image Embeddings using Convolutional Neural Networks for Improved Multi-Modal Semantics},
  booktitle = {Proceedings of EMNLP 2014},
  year = {2014},
  address = {Doha, Qatar},
  url = {http://leon.bottou.org/papers/kiela-bottou-2014},
}