% Buffalo (CIKM '24): cross-modal prototype imputation for federated biomedical
% vision-language understanding. Entry exported from the ACM Digital Library;
% `address` is the publisher's city, `location` is the conference venue
% (ACM export convention). The abstract is kept verbatim from the export.
@inproceedings{yan2024buffalo,
  author    = {Yan, Bingjie and Chen, Qian and Chen, Yiqiang and Jiang, Xinlong and Huang, Wuliang and Wang, Bingyu and Wang, Zhirui and Gao, Chenlong and Zhang, Teng},
  title     = {{Buffalo}: Biomedical Vision-Language Understanding with Cross-Modal Prototype and Federated Foundation Model Collaboration},
  booktitle = {Proceedings of the 33rd {ACM} International Conference on Information and Knowledge Management},
  series    = {CIKM '24},
  location  = {Boise, ID, USA},
  pages     = {2775--2785},
  numpages  = {11},
  year      = {2024},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  isbn      = {9798400704369},
  doi       = {10.1145/3627673.3679627},
  url       = {https://doi.org/10.1145/3627673.3679627},
  keywords  = {biomedical vision-language understanding, cross-modal prototype, federated learning, modal heterogeneity, multi-modal},
  abstract  = {Federated learning (FL) enables collaborative learning across multiple biomedical data silos with multimodal foundation models while preserving privacy. Due to the heterogeneity in data processing and collection methodologies across diverse medical institutions and the varying medical inspections patients undergo, modal heterogeneity exists in practical scenarios, where severe modal heterogeneity may even prevent model training. With privacy considerations, data transfer cannot be permitted, restricting knowledge exchange among different clients. To trickle these issues, we propose a cross-modal prototype imputation method for visual-language understanding (Buffalo) with only a slight increase in communication cost, which can improve the performance of fine-tuning general foundation models for downstream biomedical tasks. We conducted extensive experiments on medical report generation and biomedical visual question-answering tasks. The results demonstrate that Buffalo can fully utilize data from all clients to improve model generalization compared to other modal imputation methods in three modal heterogeneity scenarios, approaching or even surpassing the performance in the ideal scenario without missing modality.},
}