@inproceedings{tao2026omnizip,
  author    = {Tao, Keda and Shao, Kele and Yu, Bohan and Wang, Weiqiang and Liu, Jian and Wang, Huan},
  title     = {{OmniZip}: Audio-Guided Dynamic Token Compression for Fast Omnimodal Large Language Models},
  booktitle = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  year      = {2026},
}
CVPR
StreamingTOM: Streaming Token Compression for Efficient Video Understanding
@inproceedings{chen2026streamingtom,
  author    = {Chen, Xueyi and Tao, Keda and Shao, Kele and Wang, Huan},
  title     = {{StreamingTOM}: Streaming Token Compression for Efficient Video Understanding},
  booktitle = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  year      = {2026},
}
CVPR
Can MLLMs Guide Me Home? A Benchmark Study on Fine-Grained Visual Reasoning from Transit Maps
Sicheng Feng, Song Wang, Shuyi Ouyang, Lingdong Kong, Zikai Song, Jianke Zhu, Huan Wang, and Xinchao Wang
In IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), 2026
@inproceedings{feng2026reasonmap,
  author    = {Feng, Sicheng and Wang, Song and Ouyang, Shuyi and Kong, Lingdong and Song, Zikai and Zhu, Jianke and Wang, Huan and Wang, Xinchao},
  title     = {Can {MLLMs} Guide Me Home? A Benchmark Study on Fine-Grained Visual Reasoning from Transit Maps},
  booktitle = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  year      = {2026},
}
ICLR
MergeMix: A Unified Augmentation Paradigm for Visual and Multi-Modal Understanding
@inproceedings{jin2026mergemix,
  author    = {Jin, Xin and Li, Siyuan and Jian, Siyong and Yu, Kai and Wang, Huan},
  title     = {{MergeMix}: A Unified Augmentation Paradigm for Visual and Multi-Modal Understanding},
  booktitle = {International Conference on Learning Representations (ICLR)},
  year      = {2026},
}
ICLR
OBS-Diff: Accurate Pruning For Diffusion Models in One-Shot
@inproceedings{zhu2026obsdiff,
  author    = {Zhu, Junhan and Wang, Hesong and Su, Mingluo and Wang, Zefang and Wang, Huan},
  title     = {{OBS-Diff}: Accurate Pruning For Diffusion Models in One-Shot},
  booktitle = {International Conference on Learning Representations (ICLR)},
  year      = {2026},
}
ICLR
RewardMap: Tackling Sparse Rewards in Fine-grained Visual Reasoning via Multi-Stage Reinforcement Learning
@inproceedings{feng2026rewardmap,
  author    = {Feng, Sicheng and Tuo, Kaiwen and Wang, Song and Kong, Lingdong and Zhu, Jianke and Wang, Huan},
  title     = {{RewardMap}: Tackling Sparse Rewards in Fine-grained Visual Reasoning via Multi-Stage Reinforcement Learning},
  booktitle = {International Conference on Learning Representations (ICLR)},
  year      = {2026},
}
ICLR
Autoregressive Image Generation with Randomized Parallel Decoding
@inproceedings{li2026arpg,
  author    = {Li, Haopeng and Yang, Jinyue and Li, Guoqi and Wang, Huan},
  title     = {Autoregressive Image Generation with Randomized Parallel Decoding},
  booktitle = {International Conference on Learning Representations (ICLR)},
  year      = {2026},
}
CPAL Oral
ROSE: Reordered SparseGPT for More Accurate One-Shot Large Language Models Pruning
@inproceedings{su2026rose,
  author    = {Su, Mingluo and Wang, Huan},
  title     = {{ROSE}: Reordered {SparseGPT} for More Accurate One-Shot Large Language Models Pruning},
  booktitle = {Conference on Parsimony and Learning (CPAL)},
  year      = {2026},
}
CPAL
ResSVD: Residual Compensated SVD for Large Language Model Compression
@inproceedings{bai2026ressvd,
  author    = {Bai, Haolei and Jian, Siyong and Liang, Tuo and Yin, Yu and Wang, Huan},
  title     = {{ResSVD}: Residual Compensated {SVD} for Large Language Model Compression},
  booktitle = {Conference on Parsimony and Learning (CPAL)},
  year      = {2026},
}
TMLR
When Tokens Talk Too Much: A Survey of Multimodal Long-Context Token Compression across Images, Videos, and Audios
@article{shao2026tokens,
  author  = {Shao, Kele and Tao, Keda and Zhang, Kejia and Feng, Sicheng and Cai, Mu and Shang, Yuzhang and You, Haoxuan and Qin, Can and Sui, Yang and Wang, Huan},
  title   = {When Tokens Talk Too Much: A Survey of Multimodal Long-Context Token Compression across Images, Videos, and Audios},
  journal = {Transactions on Machine Learning Research (TMLR)},
  year    = {2026},
}
arXiv
LVOmniBench: Pioneering Long Audio-Video Understanding Evaluation for Omnimodal LLMs
@article{tao2026lvomnibench,
  author        = {Tao, Keda and Zheng, Yuhua and Xu, Jia and Du, Wenjie and Shao, Kele and Wang, Hesong and Chen, Xueyi and Jin, Xin and Zhu, Junhan and Yu, Bohan and Wang, Weiqiang and Liu, Jian and Qin, Can and Zhang, Yulun and Yang, Ming-Hsuan and Wang, Huan},
  title         = {{LVOmniBench}: Pioneering Long Audio-Video Understanding Evaluation for Omnimodal {LLMs}},
  journal       = {arXiv preprint arXiv:2603.19217},
  year          = {2026},
  eprint        = {2603.19217},
  archiveprefix = {arXiv},
}
arXiv
MobileKernelBench: Can LLMs Write Efficient Kernels for Mobile Devices?
@article{zou2026mobilekernelbench,
  author        = {Zou, Xingze and Wang, Jing and Zheng, Yuhua and Chen, Xueyi and Bai, Haolei and Kong, Lingcheng and Abu-Bakar, Syed A. R. and Wang, Zhaode and Lv, Chengfei and Hu, Haoji and Wang, Huan},
  title         = {{MobileKernelBench}: Can {LLMs} Write Efficient Kernels for Mobile Devices?},
  journal       = {arXiv preprint arXiv:2603.11935},
  year          = {2026},
  eprint        = {2603.11935},
  archiveprefix = {arXiv},
}
arXiv
DICE: Diffusion Large Language Models Excel at Generating CUDA Kernels
Haolei Bai, Lingcheng Kong, Xueyi Chen, Jianmian Wang, Zhiqiang Tao, and Huan Wang
@article{bai2026dice,
  author        = {Bai, Haolei and Kong, Lingcheng and Chen, Xueyi and Wang, Jianmian and Tao, Zhiqiang and Wang, Huan},
  title         = {{DICE}: Diffusion Large Language Models Excel at Generating {CUDA} Kernels},
  journal       = {arXiv preprint arXiv:2602.11715},
  year          = {2026},
  eprint        = {2602.11715},
  archiveprefix = {arXiv},
}
2025
NeurIPS
HoliTom: Holistic Token Merging for Fast Video Large Language Models
@inproceedings{shao2025holitom,
  author    = {Shao, Kele and Tao, Keda and Qin, Can and You, Haoxuan and Sui, Yang and Wang, Huan},
  title     = {{HoliTom}: Holistic Token Merging for Fast Video Large Language Models},
  booktitle = {Advances in Neural Information Processing Systems (NeurIPS)},
  year      = {2025},
}
NeurIPS
Poison as Cure: Visual Noise for Mitigating Object Hallucinations in LVMs
Kejia Zhang, Keda Tao, Jiasheng Tang, and Huan Wang
In Advances in Neural Information Processing Systems (NeurIPS), 2025
@inproceedings{zhang2025poisoncure,
  author    = {Zhang, Kejia and Tao, Keda and Tang, Jiasheng and Wang, Huan},
  title     = {Poison as Cure: Visual Noise for Mitigating Object Hallucinations in {LVMs}},
  booktitle = {Advances in Neural Information Processing Systems (NeurIPS)},
  year      = {2025},
}
NeurIPS
FreqExit: Enabling Early-Exit Inference for Visual Autoregressive Models via Frequency-Aware Guidance
@inproceedings{li2025freqexit,
  author    = {Li, Ying and Lv, Chengfei and Wang, Huan},
  title     = {{FreqExit}: Enabling Early-Exit Inference for Visual Autoregressive Models via Frequency-Aware Guidance},
  booktitle = {Advances in Neural Information Processing Systems (NeurIPS)},
  year      = {2025},
}
CVPR
DyCoke: Dynamic Compression of Tokens for Fast Video Large Language Models
@inproceedings{tao2025dycoke,
  author    = {Tao, Keda and Qin, Can and You, Haoxuan and Sui, Yang and Wang, Huan},
  title     = {{DyCoke}: Dynamic Compression of Tokens for Fast Video Large Language Models},
  booktitle = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  year      = {2025},
}
ICCV
On-Device Diffusion Transformer Policy for Efficient Robot Manipulation
@inproceedings{wu2025ondevice,
  author    = {Wu, Yiming and Wang, Huan and Chen, Zhenghao and Pang, Jianxin and Xu, Dong},
  title     = {On-Device Diffusion Transformer Policy for Efficient Robot Manipulation},
  booktitle = {IEEE/CVF International Conference on Computer Vision (ICCV)},
  year      = {2025},
}
TCSVT
Niagara: Normal-Integrated Geometric Affine Field for Scene Reconstruction from a Single View
@article{wu2025niagara,
  author  = {Wu, Xianzu and Ai, Zhenxin and Yang, Harry and Lim, Ser-Nam and Liu, Jun and Wang, Huan},
  title   = {Niagara: Normal-Integrated Geometric Affine Field for Scene Reconstruction from a Single View},
  journal = {IEEE Transactions on Circuits and Systems for Video Technology (TCSVT)},
  year    = {2025},
}
arXiv
Active Perception Agent for Omnimodal Audio-Video Understanding
Keda Tao, Wenjie Du, Bohan Yu, Weiqiang Wang, Jian Liu, and Huan Wang
@article{tao2025omniagent,
  author        = {Tao, Keda and Du, Wenjie and Yu, Bohan and Wang, Weiqiang and Liu, Jian and Wang, Huan},
  title         = {Active Perception Agent for Omnimodal Audio-Video Understanding},
  journal       = {arXiv preprint arXiv:2512.23646},
  year          = {2025},
  eprint        = {2512.23646},
  archiveprefix = {arXiv},
}
arXiv
Which Heads Matter for Reasoning? RL-Guided KV Cache Compression
@article{du2025rlkv,
  author        = {Du, Wenjie and Jiang, Li and Tao, Keda and Liu, Xue and Wang, Huan},
  title         = {Which Heads Matter for Reasoning? {RL}-Guided {KV} Cache Compression},
  journal       = {arXiv preprint arXiv:2510.08525},
  year          = {2025},
  eprint        = {2510.08525},
  archiveprefix = {arXiv},
}
arXiv
ConCuR: Conciseness Makes State-of-the-Art Kernel Generation
Lingcheng Kong, Jiateng Wei, Hanzhang Shen, and Huan Wang
@article{kong2025concur,
  author        = {Kong, Lingcheng and Wei, Jiateng and Shen, Hanzhang and Wang, Huan},
  title         = {{ConCuR}: Conciseness Makes State-of-the-Art Kernel Generation},
  journal       = {arXiv preprint arXiv:2510.07356},
  year          = {2025},
  eprint        = {2510.07356},
  archiveprefix = {arXiv},
}
arXiv
SparseSSM: Efficient Selective Structured State Space Models Can Be Pruned in One-Shot
@article{tuo2025sparsessm,
  author        = {Tuo, Kaiwen and Wang, Huan},
  title         = {{SparseSSM}: Efficient Selective Structured State Space Models Can Be Pruned in One-Shot},
  journal       = {arXiv preprint arXiv:2506.09613},
  year          = {2025},
  eprint        = {2506.09613},
  archiveprefix = {arXiv},
}
arXiv
Plug-and-Play 1.x-Bit KV Cache Quantization for Video Large Language Models
@article{tao2025plugandplay,
  author        = {Tao, Keda and You, Haoxuan and Sui, Yang and Qin, Can and Wang, Huan},
  title         = {Plug-and-Play 1.x-Bit {KV} Cache Quantization for Video Large Language Models},
  journal       = {arXiv preprint arXiv:2503.16257},
  year          = {2025},
  eprint        = {2503.16257},
  archiveprefix = {arXiv},
}
@article{feng2024oracle,
  author        = {Feng, Sicheng and Tao, Keda and Wang, Huan},
  title         = {Is Oracle Pruning the True Oracle?},
  journal       = {arXiv preprint arXiv:2412.00143},
  year          = {2024},
  eprint        = {2412.00143},
  archiveprefix = {arXiv},
}