#!/bin/bash
# 📊 Token Counter for Copilot Instruction Files
#
# Reports the token footprint of .github/copilot-instructions.md and
# AGENTS.md (both resolved relative to the current directory).
#
# Uses tiktoken (OpenAI's tokenizer) for accurate counting when python3 can
# import it; otherwise falls back to a word-based approximation of
# ~0.75 words per token (English text), i.e. tokens ≈ words * 4 / 3.
#
# Environment:
#   AUTO_INSTALL_TIKTOKEN  non-empty: install tiktoken without prompting.
#                          Checked before CI detection so it also works in
#                          non-interactive runs.
#   CI, CI_CD              non-empty: never prompt; use the approximation.
#   USE_APPROX             non-empty: force the word-based approximation.
#
# Exit status: 0 on success; 1 if python3 is missing or a tiktoken count
# does not produce a number.

set -eu

readonly COPILOT_FILE=".github/copilot-instructions.md"
readonly AGENTS_FILE="AGENTS.md"
readonly TARGET=600 # token budget for copilot-instructions.md
readonly LIMIT=650  # hard ceiling before optimization is demanded

# Non-empty means "use the word-based approximation". May be preset by the
# caller's environment.
USE_APPROX=${USE_APPROX:-}

# Out-parameter of report_file_tokens (stdout is used for the report itself).
FILE_TOKENS=0

# Succeeds if $1 is a non-empty string of decimal digits.
is_uint() {
  [[ "${1:-}" =~ ^[0-9]+$ ]]
}

# Prints the approximate token count for $1 words (integer math, 4/3 ratio).
approx_tokens() {
  local words=$1
  echo $((words * 4 / 3))
}

# Prints the integer percentage $1 (AGENTS.md tokens) is of $2 (total);
# prints 0 when the total is 0 to avoid dividing by zero.
savings_percent() {
  local agents=$1 total=$2
  if ((total > 0)); then
    echo $((agents * 100 / total))
  else
    echo 0
  fi
}

# Prints the budget verdict for a copilot-instructions.md count of $1 tokens.
budget_message() {
  local tokens=$1
  if ((tokens <= TARGET)); then
    echo "✅ copilot-instructions.md within target ($TARGET tokens)"
  elif ((tokens <= LIMIT)); then
    echo "⚠️ copilot-instructions.md over target but within limit ($LIMIT tokens)"
  else
    echo "❌ copilot-instructions.md exceeds limit! Optimization required."
  fi
}

# Succeeds if python3 can import tiktoken.
tiktoken_available() {
  python3 -c "import tiktoken" 2>/dev/null
}

# Runs the tiktoken counter on file $1 and prints the token count.
# The program is fed on stdin ("python3 -"), so no temp file is created.
tiktoken_count() {
  python3 - "$1" <<'PYTHON'
import sys

import tiktoken

# cl100k_base is the GPT-4 encoding. Counts for other model families
# (e.g. Claude) should be treated as an estimate only.
encoding = tiktoken.get_encoding("cl100k_base")

file_path = sys.argv[1]
with open(file_path, 'r', encoding='utf-8') as f:
    content = f.read()

tokens = encoding.encode(content)
print(len(tokens))
PYTHON
}

# Prints the validated token count of file $1; returns 1 (with a diagnostic
# on stderr) when the counter did not end with a plain number.
count_tokens() {
  local file=$1 output
  # stderr is merged so the known-noisy "code for hash" line can be dropped;
  # the count is the last remaining line.
  output=$(tiktoken_count "$file" 2>&1 |
    grep -v "ERROR:root:code for hash" |
    tail -n 1) || true
  if ! is_uint "$output"; then
    printf '❌ Failed to count tokens in %s: %s\n' "$file" "$output" >&2
    return 1
  fi
  printf '%s\n' "$output"
}

# Installs tiktoken for the python3 interpreter this script uses.
# $1 is the progress line to print first. Returns 0 only if tiktoken is
# importable afterwards, so the caller never needs to re-exec the script.
install_tiktoken() {
  echo "$1"
  # "python3 -m pip" targets the same interpreter that will import tiktoken.
  if python3 -m pip install tiktoken --quiet && tiktoken_available; then
    echo "✅ tiktoken installed successfully!"
    echo ""
    return 0
  fi
  echo "❌ Installation failed. Using word-based approximation instead."
  echo ""
  return 1
}

# Decides between tiktoken and the approximation when tiktoken is missing;
# sets USE_APPROX=1 whenever tiktoken will not be usable.
resolve_missing_tiktoken() {
  local answer=""

  echo "⚠️ tiktoken not installed"
  echo ""

  if [[ -n "${AUTO_INSTALL_TIKTOKEN:-}" ]]; then
    # Must come before CI detection, otherwise the opt-in advertised in the
    # CI message below could never take effect.
    install_tiktoken "📥 Installing tiktoken (AUTO_INSTALL_TIKTOKEN=1)..." ||
      USE_APPROX=1
  elif [[ ! -t 0 || -n "${CI:-}" || -n "${CI_CD:-}" ]]; then
    echo "🤖 Non-interactive environment detected (CI/CD)"
    echo "📝 Using word-based approximation"
    echo " (To auto-install in CI, set AUTO_INSTALL_TIKTOKEN=1)"
    echo ""
    USE_APPROX=1
  else
    echo "tiktoken provides accurate token counting for Claude/GPT models."
    # EOF on the prompt is treated as "no" instead of aborting under set -e.
    read -r -n 1 -p "📦 Install tiktoken now? (y/n): " answer || true
    echo ""
    if [[ "$answer" =~ ^[Yy]$ ]]; then
      install_tiktoken "📥 Installing tiktoken..." || USE_APPROX=1
    else
      echo "📝 Using word-based approximation instead"
      echo " (Install manually: pip3 install tiktoken)"
      echo ""
      USE_APPROX=1
    fi
  fi
}

# Prints the tiktoken section for file $1 and stores its count in
# FILE_TOKENS (0 when the file does not exist). Exits 1 if counting fails.
report_file_tokens() {
  local file=$1
  echo "📄 $file"
  if [[ -f "$file" ]]; then
    FILE_TOKENS=$(count_tokens "$file") || exit 1
    echo " Tokens: $FILE_TOKENS"
  else
    echo " ⚠️ File not found, skipping"
    FILE_TOKENS=0
  fi
  echo ""
}

# Prints the word-based section for file $1; a missing file is reported
# and skipped rather than aborting the script.
report_file_words() {
  local file=$1 words
  echo "📄 $file"
  if [[ -f "$file" ]]; then
    # Some wc implementations pad the number with blanks; strip them.
    words=$(wc -w <"$file" | tr -d '[:space:]')
    echo " Words: $words"
    echo " Approx tokens: $(approx_tokens "$words")"
  else
    echo " ⚠️ File not found, skipping"
  fi
  echo ""
}

# Full tiktoken report: per-file counts, summary, budget verdict, savings.
report_tiktoken() {
  local copilot_tokens agents_tokens total

  report_file_tokens "$COPILOT_FILE"
  copilot_tokens=$FILE_TOKENS
  report_file_tokens "$AGENTS_FILE"
  agents_tokens=$FILE_TOKENS

  total=$((copilot_tokens + agents_tokens))
  echo "📊 Summary"
  echo " Base load (auto): $copilot_tokens tokens"
  echo " On-demand load: $agents_tokens tokens"
  echo " Total (if both): $total tokens"
  echo ""

  budget_message "$copilot_tokens"

  if ((total > 0)); then
    echo "💡 Savings: $(savings_percent "$agents_tokens" "$total")% saved when AGENTS.md not needed"
  else
    echo "💡 Savings: 0% (no tokens to count)"
  fi
}

# Fallback report: word-based approximation for both files.
report_approx() {
  report_file_words "$COPILOT_FILE"
  report_file_words "$AGENTS_FILE"
  echo "💡 Note: Run script again to install tiktoken for accurate counts"
}

main() {
  echo "📊 Token Analysis for Copilot Instructions"
  echo "=========================================="
  echo ""

  if ! command -v python3 >/dev/null 2>&1; then
    {
      echo "❌ Python3 not found"
      echo " Python 3 is required for token counting"
      echo " Install from: https://www.python.org/downloads/"
      echo ""
    } >&2
    exit 1
  fi

  if [[ -z "$USE_APPROX" ]]; then
    if tiktoken_available; then
      echo "✅ Using tiktoken (accurate Claude/GPT tokenization)"
      echo ""
    else
      resolve_missing_tiktoken
    fi
  fi

  if [[ -n "$USE_APPROX" ]]; then
    report_approx
  else
    report_tiktoken
  fi

  echo ""
  echo "=========================================="
}

main "$@"
0 commit comments