% Journal article on multi-modal (camera image + GPS) beam prediction.
% Authors are listed in "Last, First" form and separated by " and ", which is
% the only name separator BibTeX recognises; a comma between two full names
% would be parsed as a single "Last, First" person.
% NOTE(review): the DOI suffix (2024.49.9.1330) suggests volume 49, number 9,
% first page 1330 -- confirm against the publisher record before adding
% volume/number/pages fields.
@article{MC281538A,
  author   = {Yeo, Yerin and Kim, Junghyun},
  title    = {Multi-Modal Sensing-Assisted Beam Prediction for {UAV} Communications},
  journal  = {The Journal of Korean Institute of Communications and Information Sciences},
  year     = {2024},
  issn     = {1226-4717},
  doi      = {10.7840/kics.2024.49.9.1330},
  keywords = {Beam prediction, Deep learning, Transformer, Multi-modal learning, Wireless communications},
  abstract = {In this paper, we propose a deep learning model to predict the optimal beam for wireless communication systems by utilizing both camera image data and GPS data, enabling efficient beamforming. Existing work has proposed single-modal beam prediction models that utilize camera image data and GPS data individually. However, these models have limitations in that they are sensitive to measurement environments and outliers. To overcome the limitations, we propose a new model that combines and utilizes the two types of data based on a derivative model of Transformer called Vision Transformer. Experimental results show that the proposed model exhibits higher performance in terms of Top-1, 2, 3 accuracy for both 32-beam and 64-beam scenarios compared to the existing model. Particularly, the Top-3 accuracy of the proposed model showed nearly 100\% accuracy in both scenarios.},
}