其他
我逆向了GitHub Copilot,这是代码实现
👉导读
众所周知,Github Copilot 是一种基于机器学习的代码自动补全工具。它使用了来自 GitHub 的大量代码作为训练数据,并使用 OpenAI 的语言模型来生成代码。Copilot 还可以学习用户的编码习惯,并根据上下文推断出正确的代码片段。在实际使用中发现大部份提示还是非常好用的,能够较为准确的推测出用户意图,甚至是基于项目其他文件的上下文进行推理。比较好奇这里是怎么做到的,于是探索了这个 VSCode 插件的详细实现。👉目录
1 准备工作2 入口分析3 代码提示入口逻辑4 getGhostText 核心逻辑5 Extract 核心逻辑6 getPrompt 核心逻辑7 抓包实验一下8 小结01
1.1 分割 webpack_modules
const ast = parser.parse(source);
function parseModules() {
traverse(ast, {
enter(path) {
if (
path.node.type === "VariableDeclarator" &&
path.node.id.name === "__webpack_modules__"
) {
const modules = path.node.init.properties;
for (const module of modules) {
const moduleId = module.key.value;
const moduleAst = module.value;
const moduleSource = generate(moduleAst).code;
try {
const ast = transformRequire(prettier(clearfyParams(moduleId, moduleSource)));
const mainBody = ast.program.body[0].expression.body.body;
const moduleCode = generate(types.Program(mainBody)).code;
fs.writeFileSync(
"./prettier/modules/" + moduleId + ".js",
moduleCode,
"utf8"
);
} catch (e) {
console.log(e);
}
}
}
},
});
}
1.2 识别模块依赖
function clearfyParams(moduleId, moduleSource) {
if (moduleSource.trim().startsWith("function")) {
// change `function(e, t, n) {` to `(e, t, n) => {`
moduleSource = moduleSource.replace("function", "");
moduleSource = moduleSource.replace(")", ") =>");
}
const moduleAst = parser.parse(moduleSource);
let flag = false;
traverse(moduleAst, {
ArrowFunctionExpression(path) {
if (flag) return;
const params = path.node.params;
params.forEach((param) => {
if (param.name === "e" || param.name === "t" || param.name === "n") {
path.scope.rename(
param.name,
{
e: "module",
t: "exports",
n: "require",
}[param.name]
);
}
});
flag = true;
},
});
return moduleAst;
}
var r = require(12781).Stream;
var i = require(73837);
function o() {
this.source = null;
this.dataSize = 0;
this.maxDataSize = 1048576;
this.pauseStream = !0;
this._maxDataSizeExceeded = !1;
this._released = !1;
this._bufferedEvents = [];
}
module.exports = o;
1.3 优化压缩后的语法
function prettier(ast) {
const moduleTransformer = {
// e.g., `(0, r.getConfig)(e, r.ConfigKey.DebugOverrideProxyUrl);`
// gets transformed to r.getConfig(e, r.ConfigKey.DebugOverrideProxyUrl);
CallExpression(path) {
if (path.node.callee.type != "SequenceExpression") {
return;
}
if (
path.node.callee.expressions.length == 2 &&
path.node.callee.expressions[0].type == "NumericLiteral"
) {
path.node.callee = path.node.callee.expressions[1];
}
},
ExpressionStatement(path) {
if (path.node.expression.type == "SequenceExpression") {
const exprs = path.node.expression.expressions;
let exprStmts = exprs.map((e) => {
return types.expressionStatement(e);
});
path.replaceWithMultiple(exprStmts);
return;
}
if (path.node.expression.type == "AssignmentExpression") {
// handle cases like: `a = (expr1, expr2, expr3)`
// convert to: `expr1; expr2; a = expr3;`
if (path.node.expression.right.type == "SequenceExpression") {
const exprs = path.node.expression.right.expressions;
let exprStmts = exprs.map((e) => {
return types.expressionStatement(e);
});
let lastExpr = exprStmts.pop();
path.node.expression.right = lastExpr.expression;
exprStmts.push(path.node);
path.replaceWithMultiple(exprStmts);
return;
}
// handle cases like: `exports.GoodExplainableName = a;` where `a` is a function or a class
// rename `a` to `GoodExplainableName` everywhere in the module
if (
path.node.expression.left.type == "MemberExpression" &&
path.node.expression.left.object.type == "Identifier" &&
path.node.expression.left.object.name == "exports" &&
path.node.expression.left.property.type == "Identifier" &&
path.node.expression.left.property.name != "default" &&
path.node.expression.right.type == "Identifier" &&
path.node.expression.right.name.length == 1
) {
path.scope.rename(
path.node.expression.right.name,
path.node.expression.left.property.name
);
return;
}
}
if (path.node.expression.type == "ConditionalExpression") {
// handle cases like: `<test> ? c : d;`
// convert to: `if (<test>) { c; } else { d; }`
const test = path.node.expression.test;
const consequent = path.node.expression.consequent;
const alternate = path.node.expression.alternate;
const ifStmt = types.ifStatement(
test,
types.blockStatement([types.expressionStatement(consequent)]),
types.blockStatement([types.expressionStatement(alternate)])
);
path.replaceWith(ifStmt);
return;
}
if (path.node.expression.type == "LogicalExpression") {
// handle cases like: `a && b;`
// convert to: `if (a) { b; }`
const test = path.node.expression.left;
const consequent = path.node.expression.right;
const ifStmt = types.ifStatement(
test,
types.blockStatement([types.expressionStatement(consequent)]),
null
);
path.replaceWith(ifStmt);
return;
}
},
IfStatement(path) {
if (!path.node.test || path.node.test.type != "SequenceExpression") {
return;
}
const exprs = path.node.test.expressions;
let exprStmts = exprs.map((e) => {
return types.expressionStatement(e);
});
let lastExpr = exprStmts.pop();
path.node.test = lastExpr.expression;
exprStmts.push(path.node);
path.replaceWithMultiple(exprStmts);
},
ReturnStatement(path) {
if (
!path.node.argument ||
path.node.argument.type != "SequenceExpression"
) {
return;
}
const exprs = path.node.argument.expressions;
let exprStmts = exprs.map((e) => {
return types.expressionStatement(e);
});
let lastExpr = exprStmts.pop();
let returnStmt = types.returnStatement(lastExpr.expression);
exprStmts.push(returnStmt);
path.replaceWithMultiple(exprStmts);
},
VariableDeclaration(path) {
// change `const a = 1, b = 2;` to `const a = 1; const b = 2;`
if (path.node.declarations.length > 1) {
let newDecls = path.node.declarations.map((d) => {
return types.variableDeclaration(path.node.kind, [d]);
});
path.replaceWithMultiple(newDecls);
}
},
};
traverse(ast, moduleTransformer);
return ast;
}
1.4 require 的模块 id 取名
function transformRequire(ast) {
const moduleTransformer = {
VariableDeclaration(path) {
if (path.node.declarations[0].init && path.node.declarations[0].init.type === "CallExpression") {
if (path.node.declarations[0].init.callee.name === "require") {
const moduleId = path.node.declarations[0].init.arguments[0].value;
if (NameMap[moduleId]) {
const { name, path: modulePath} = NameMap[moduleId];
path.node.declarations[0].init.arguments[0].value = '"'+modulePath+'"';
path.scope.rename(path.node.declarations[0].id.name, name);
}
}
}
},
};
traverse(ast, moduleTransformer);
return ast;
}
02
03
如果用户关闭了 InlineSuggestEnable、或者 document 不在处理白名单内,或者用户取消了输入,都会提前 return,不进行代码提示。
调用 getGhostText 方法拿到 texts,这个大概就是最终会返回给用户的代码提示文本。
调用 completionsFromGhostTextResults,拿到最终的 completions。这个函数比较简单,主要对文本进行了一些格式化的处理,比如处理 Tab 空格的问题,以及根据光标当前的位置计算出代码提示应当显示在编辑器中的坐标范围。
04
4.1 提取 Prompt
const prompt = await extractprompt.extractPrompt(ctx, document, position);
4.2 边界判断
if ("copilotNotAvailable" === prompt.type) {
exports.ghostTextLogger.debug(
ctx,
"Copilot not available, due to the .copilotignore settings"
);
return {
type: "abortedBeforeIssued",
reason: "Copilot not available due to the .copilotignore settings",
};
}
if ("contextTooShort" === prompt.type) {
exports.ghostTextLogger.debug(ctx, "Breaking, not enough context");
return {
type: "abortedBeforeIssued",
reason: "Not enough context",
};
}
if (token?.isCancellationRequested) {
exports.ghostTextLogger.info(ctx, "Cancelled after extractPrompt");
return {
type: "abortedBeforeIssued",
reason: "Cancelled after extractPrompt",
};
}
包含在 .copilotignore 里的文件;
上下文太少了;
用户已经取消了。
4.3 二级判断
function updateGlobalCacheKey(prefix, suffix, promptKey) {
prefixCache = prefix;
suffixCache = suffix;
promptKeyCache = promptKey;
}
exports.completionCache = new s.LRUCacheMap(100);
exports.keyForPrompt = function (e) {
return r.SHA256(e.prefix + e.suffix).toString();
};
4.4 真正发起请求
设置 Debounce 时延;
判断 contexualFilterScore 是否达到阈值。
exports.getDebounceLimit = async function (e, t) {
let n;
if ((await e.get(r.Features).debouncePredict()) && t.measurements.contextualFilterScore) {
const e = t.measurements.contextualFilterScore;
const r = .3475;
const i = 7;
n = 25 + 250 / (1 + Math.pow(e / r, i));
} else n = await e.get(r.Features).debounceMs();
return n > 0 ? n : 75;
};
4.5 流程总结
05
async fetchExperiments(e, t) {
const n = e.get(r.Fetcher);
let o;
try {
o = await n.fetch("https://default.exp-tas.com/vscode/ab", {
method: "GET",
headers: t
});
} catch (t) {
return i.ExpConfig.createFallbackConfig(e, `Error fetching ExP config: ${t}`);
}
if (!o.ok) return i.ExpConfig.createFallbackConfig(e, `ExP responded with ${o.status}`);
const s = await o.json(),
a = s.Configs.find(e => "vscode" === e.Id) ?? {
Id: "vscode",
Parameters: {}
},
c = Object.entries(a.Parameters).map(([e, t]) => e + (t ? "" : "cf"));
return new i.ExpConfig(a.Parameters, s.AssignmentContext, c.join(";"));
}
suffixPercent,默认值为15; fimSuffixLengthThreshold,默认值为0; maxPromptCompletionTokens,默认值为2048; neighboringTabsOption,默认值为 eager; neighboringSnippetTypes,默认值为 NeighboringSnippets; numberOfSnippets,默认值为4; snippetPercent,默认值为0; suffixStartMode,默认值为 CursorTrimStart; tokenizerName, 默认值为 cushman002; indentationMinLength,默认值为 undefined; indentationMaxLength,默认值为 undefined;
cursorContextFix,默认值为 false;
06
6.1 一些额外的配置字段
languageMarker,默认值为 Top; pathMarker,默认值为 Top; localImportContext,默认值为 Declarations; snippetPosition,默认值为 TopOfText; lineEnding,默认值为 ConvertToUnix; suffixMatchThreshold,默认值为0; suffixMatchCriteria,默认值为 Levenshtein; cursorSnippetsPickingStrategy,默认值为 CursorJaccard。
6.2 prompt 的组成
BeforeCursor,是光标前的内容; AfterCursor,是光标后的内容; SimilarFile,与当前文件相似度较高的内容; ImportedFile:import 依赖;
LanguageMarkder,文件开头的标记语法;
PathMarker,文件的路径信息。
6.3 PromptElement 的优先级
class Priorities {
constructor() {
this.registeredPriorities = [0, 1];
}
register(e) {
if (e > Priorities.TOP || e < Priorities.BOTTOM) throw new Error("Priority must be between 0 and 1");
this.registeredPriorities.push(e);
return e;
}
justAbove(...e) {
const t = Math.max(...e);
const n = Math.min(...this.registeredPriorities.filter(e => e > t));
return this.register((n + t) / 2);
}
justBelow(...e) {
const t = Math.min(...e);
const n = Math.max(...this.registeredPriorities.filter(e => e < t));
return this.register((n + t) / 2);
}
between(e, t) {
if (this.registeredPriorities.some(n => n > e && n < t) || !this.registeredPriorities.includes(e) || !this.registeredPriorities.includes(t)) throw new Error("Priorities must be adjacent in the list of priorities");
return this.register((e + t) / 2);
}
}
const beforeCursorPriority = priorities.justBelow(p.Priorities.TOP);
const languageMarkerPriority =
promptOpts.languageMarker === h.Always
? priorities.justBelow(p.Priorities.TOP)
: priorities.justBelow(beforeCursorPriority);
const pathMarkerPriority =
promptOpts.pathMarker === f.Always ? priorities.justBelow(p.Priorities.TOP) : priorities.justBelow(beforeCursorPriority);
const importedFilePriority = priorities.justBelow(beforeCursorPriority);
const lowSnippetPriority = priorities.justBelow(importedFilePriority);
const highSnippetPriority = priorities.justAbove(beforeCursorPriority);
beforeCursorPriority,为0.5;
languageMarkerPriority,为0.25;
pathMarkderPriority,为0.375;
importedFilePriority,为0.4375;
lowSnippetPriority,为0.40625;
highSnippetPriority,为0.75。
6.4 PromptElement 主要内容
6.4.1 languageMarker 和 pathMarker
if (promptOpts.languageMarker !== h.NoMarker) {
const e = newLineEnded(r.getLanguageMarker(resourceInfo));
languageMarkerId = promptWishlist.append(e, p.PromptElementKind.LanguageMarker, languageMarkerPriority);
}
if (promptOpts.pathMarker !== f.NoMarker) {
const e = newLineEnded(r.getPathMarker(resourceInfo));
if (e.length > 0) {
pathMarkerId = promptWishlist.append(e, p.PromptElementKind.PathMarker, pathMarkerPriority);
}
}
exports.getLanguageMarker = function (e) {
const {
languageId: t
} = e;
return -1 !== n.indexOf(t) || hasLanguageMarker(e) ? "" : t in r ? r[t] : comment(`Language: ${t}`, t);
};
const n = ["php", "plaintext"];
const r = {
html: "<!DOCTYPE html>",
python: "#!/usr/bin/env python3",
ruby: "#!/usr/bin/env ruby",
shellscript: "#!/bin/sh",
yaml: "# YAML data"
};
// Language: ${languageId}
exports.getPathMarker = function (e) {
return e.relativePath ? comment(`Path: ${e.relativePath}`, e.languageId) : "";
};
6.4.2 localImportContext
if (promptOpts.localImportContext !== y.NoContext)
for (const e of await i.extractLocalImportContext(resourceInfo, promptOpts.fs))
promptWishlist.append(newLineEnded(e), p.PromptElementKind.ImportedFile, importedFilePriority);
const reg = /^\s*import\s*(type|)\s*\{[^}]*\}\s*from\s*['"]\./gm;
exports.extractLocalImportContext = async function (resourceInfo, fs) {
let {
source: source,
uri: uri,
languageId: languageId
} = resourceInfo;
return fs && "typescript" === languageId ? async function (source, uri, fs) {
let language = "typescript";
let result = [];
const importEndIndex = function (source) {
let match;
let lastIndex = -1;
reg.lastIndex = -1;
do {
match = reg.exec(source);
if (match) {
lastIndex = reg.lastIndex + match.length;
}
} while (match);
if (-1 === lastIndex) return -1;
const nextNewLine = source.indexOf("\n", lastIndex);
return -1 !== nextNewLine ? nextNewLine : source.length;
}(source);
if (-1 === importEndIndex) return result;
source = source.substring(0, importEndIndex);
let ast = await i.parseTreeSitter(language, source);
try {
for (let node of function (node) {
let t = [];
for (let childNode of node.namedChildren) if ("import_statement" === childNode.type) {
t.push(childNode);
}
return t;
}(ast.rootNode)) {
let filePath = getTSFilePath(uri, node);
if (!filePath) continue;
let namedImports = parseNamedImport(node);
if (0 === namedImports.length) continue;
let exports = await getExports(filePath, language, fs);
for (let e of namedImports) if (exports.has(e.name)) {
result.push(...exports.get(e.name));
}
}
} finally {
ast.delete();
}
return result;
}(source, uri, fs) : [];
}
6.4.3 snippets
const snippets = [
...retrievalSnippets,
...(promptOpts.neighboringTabs === a.NeighboringTabsOption.None || 0 === neighborDocs.length
? []
: await a.getNeighborSnippets(
resourceInfo,
neighborDocs,
promptOpts.neighboringSnippetTypes,
promptOpts.neighboringTabs,
promptOpts.cursorContextFix,
promptOpts.indentationMinLength,
promptOpts.indentationMaxLength,
promptOpts.snippetSelection,
promptOpts.snippetSelectionK,
lineCursorHistory,
promptOpts.cursorSnippetsPickingStrategy
)),
];
let neighborDocs = [];
let neighborSource = new Map();
try {
const t = await u.NeighborSource.getNeighborFiles(ctx, uri, repoUserData);
neighborDocs = t.docs;
neighborSource = t.neighborSource;
} catch (t) {
telemetry.telemetryException(ctx, t, "prompt.getPromptForSource.exception");
}
exports.OpenTabFiles = class {
constructor(e, t) {
this.docManager = e;
this.neighboringLanguageType = t;
}
async truncateDocs(e, t, n, r) {
const o = [];
let s = 0;
for (const a of e) if (!(s + a.getText().length > i.NeighborSource.MAX_NEIGHBOR_AGGREGATE_LENGTH) && ("file" == a.uri.scheme && a.fileName !== t && i.considerNeighborFile(n, a.languageId, this.neighboringLanguageType) && (o.push({
uri: a.uri.toString(),
relativePath: await this.docManager.getRelativePath(a),
languageId: a.languageId,
source: a.getText()
}), s += a.getText().length), o.length >= r)) break;
return o;
}
async getNeighborFiles(e, t, n, o) {
let s = [];
const a = new Map();
s = await this.truncateDocs(utils2.sortByAccessTimes(this.docManager.textDocuments), e.fsPath, n, o);
a.set(i.NeighboringFileType.OpenTabs, s.map(e => e.uri));
return {
docs: s,
neighborSource: a
};
}
};
exports.getNeighborSnippets = async function (
resourceInfo,
neighborDocs,
neighboringSnippetTypes,
neighboringTabs,
cursorContextFix,
indentationMinLength,
indentationMaxLength,
snippetSelection,
snippetSelectionK,
lineCursorHistory,
cursorSnippetsPickingStrategy
) {
const options = {
...exports.neighborOptionToSelection[neighboringTabs],
};
const y = (function (
resourceInfo,
neighboringSnippetTypes,
options,
cursorContextFix,
indentationMinLength,
indentationMaxLength,
lineCursorHistory,
cursorSnippetsPickingStrategy = i.CursorSnippetsPickingStrategy
.CursorJaccard
) {
let d;
if (neighboringSnippetTypes === s.NeighboringSnippets) {
d =
void 0 !== indentationMinLength && void 0 !== indentationMaxLength
? o.IndentationBasedJaccardMatcher.FACTORY(
indentationMinLength,
indentationMaxLength,
cursorContextFix
)
: o.FixedWindowSizeJaccardMatcher.FACTORY(
options.snippetLength,
cursorContextFix
);
} else {
if (neighboringSnippetTypes === s.NeighboringFunctions) {
d = o.FunctionJaccardMatcher.FACTORY(
options.snippetLength,
cursorContextFix,
indentationMinLength,
indentationMaxLength
);
} else {
r.ok(
void 0 !== lineCursorHistory,
"lineCursorHistory should not be undefined"
);
d = i.CursorHistoryMatcher.FACTORY(
options.snippetLength,
lineCursorHistory,
cursorSnippetsPickingStrategy,
cursorContextFix
);
}
}
return d.to(resourceInfo);
})(
resourceInfo,
neighboringSnippetTypes,
options,
cursorContextFix,
indentationMinLength,
indentationMaxLength,
lineCursorHistory,
cursorSnippetsPickingStrategy
);
return 0 === options.numberOfSnippets
? []
: (
await neighborDocs
.filter((e) => e.source.length < 1e4 && e.source.length > 0)
.slice(0, 20)
.reduce(
async (e, t) =>
(
await e
).concat(
(
await y.findMatches(t, snippetSelection, snippetSelectionK)
).map((e) => ({
relativePath: t.relativePath,
...e,
}))
),
Promise.resolve([])
)
)
.filter((e) => e.score && e.snippet && e.score > options.threshold)
.sort((e, t) => e.score - t.score)
.slice(-options.numberOfSnippets);
};
neighboringSnippetTypes 默认为 NeighboringSnippets,所以会走到 FixedWindowSizeJaccardMatcher 的逻辑里;
返回值是根据 neighborDocs 的内容,过滤掉过小和过大文件,经过 findMatchers 拿到的结果;
最后过滤掉 score 较低的,不过 threshold 默认为0,所以这里应该保留了所有的内容;
根据 score 进行排序,选取较大的4条(numberOfSnippets 默认为4)。
class FixedWindowSizeJaccardMatcher extends i.WindowedMatcher {
constructor(resourceInfo, snippetLength, cursorContextFix) {
super(resourceInfo, cursorContextFix);
this.windowLength = snippetLength;
this.cursorContextFix = cursorContextFix;
}
id() {
return "fixed:" + this.windowLength;
}
getWindowsDelineations(e) {
return o.getBasicWindowDelineations(this.windowLength, e);
}
trimDocument(e) {
return e.source.slice(0, e.offset).split("\n").slice(-this.windowLength).join("\n");
}
_getCursorContextInfo(e) {
return r.getCursorContext(e, {
maxLineCount: this.windowLength,
cursorContextFix: this.cursorContextFix
});
}
similarityScore(e, t) {
return computeScore(e, t);
}
}
async findMatches(e, t = i.SnippetSelectionOption.BestMatch, n) {
if (t == i.SnippetSelectionOption.BestMatch) {
const t = await this.findBestMatch(e);
return t ? [t] : [];
}
return t == i.SnippetSelectionOption.TopK && (await this.findTopKMatches(e, n)) || [];
}
async findBestMatch(e) {
if (0 === e.source.length || 0 === this.referenceTokens.size) return;
const t = e.source.split("\n");
const n = this.retrieveAllSnippets(e, s.Descending);
return 0 !== n.length && 0 !== n[0].score ? {
snippet: t.slice(n[0].startLine, n[0].endLine).join("\n"),
semantics: o.SnippetSemantics.Snippet,
provider: o.SnippetProvider.NeighboringTabs,
...n[0]
} : void 0;
}
retrieveAllSnippets(e, t = s.Descending) {
const n = [];
if (0 === e.source.length || 0 === this.referenceTokens.size) return n;
const sourceArr = e.source.split("\n");
const key = this.id() + ":" + e.source;
const result = c.get(key) ?? [];
const noCache = 0 == result.length;
const tokens = noCache ? sourceArr.map(this.tokenizer.tokenize, this.tokenizer) : [];
for (const [index, [startLine, endLine]] of this.getWindowsDelineations(sourceArr).entries()) {
if (noCache) {
const e = new Set();
tokens.slice(startLine, endLine).forEach(t => t.forEach(e.add, e));
result.push(e);
}
const r = result[index];
const s = this.similarityScore(r, this.referenceTokens);
n.push({
score: s,
startLine: startLine,
endLine: endLine
});
}
if (noCache) {
c.put(key, result);
}
return this.sortScoredSnippets(n, t);
}
经过 tokenize 获取到当前代码片段每一行的 token; 通过 getWindowsDelineations 将代码分割成不同的小窗口(步长为1); 每个窗口的 token 和当前文件(referenceDoc)的 token 做一次相似度计算(Jaccard 相似度);
retrieveAllSnippets(e, t = s.Descending) {
const n = [];
if (0 === e.source.length || 0 === this.referenceTokens.size) return n;
const sourceArr = e.source.split("\n");
const key = this.id() + ":" + e.source;
const result = c.get(key) ?? [];
const noCache = 0 == result.length;
const tokens = noCache ? sourceArr.map(this.tokenizer.tokenize, this.tokenizer) : [];
for (const [index, [startLine, endLine]] of this.getWindowsDelineations(sourceArr).entries()) {
if (noCache) {
const e = new Set();
tokens.slice(startLine, endLine).forEach(t => t.forEach(e.add, e));
result.push(e);
}
const r = result[index];
const s = this.similarityScore(r, this.referenceTokens);
n.push({
score: s,
startLine: startLine,
endLine: endLine
});
}
if (noCache) {
c.put(key, result);
}
return this.sortScoredSnippets(n, t);
}
exports.getBasicWindowDelineations = function (e, t) {
const n = [];
const r = t.length;
if (0 == r) return [];
if (r < e) return [[0, r]];
for (let t = 0; t < r - e + 1; t++) n.push([t, t + e]);
return n;
};
get referenceTokens() {
if (void 0 === this._referenceTokens) {
this._referenceTokens = this.tokenizer.tokenize(this._getCursorContextInfo(this.referenceDoc).context);
}
return this._referenceTokens;
}
exports.getCursorContext = function e(doc, opts = {}) {
const opts = function (e) {
return {
...i,
...e
};
}(opts);
const s = r.getTokenizer(opts.tokenizerName);
if (void 0 === opts.maxTokenLength && void 0 !== opts.maxLineCount) {
const e = doc.source.slice(0, doc.offset).split("\n").slice(-opts.maxLineCount);
const n = e.join("\n");
return {
context: n,
lineCount: e.length,
tokenLength: s.tokenLength(n),
tokenizerName: opts.tokenizerName
};
}
// ...
};
function computeScore(e, t) {
const n = new Set();
e.forEach(e => {
if (t.has(e)) {
n.add(e);
}
});
return n.size / (e.size + t.size - n.size);
}
function $() {
const maxSnippetLength = Math.round((promptOpts.snippetPercent / 100) * promptOpts.maxPromptLength);
c.processSnippetsForWishlist(
snippets,
resourceInfo.languageId,
tokenizer,
promptOpts.snippetProviderOptions,
{
priorities: priorities,
low: lowSnippetPriority,
high: highSnippetPriority,
},
promptOpts.numberOfSnippets,
maxSnippetLength
).forEach((e) => {
let t = p.PromptElementKind.SimilarFile;
if (e.provider === c.SnippetProvider.Retrieval) {
t = p.PromptElementKind.RetrievalSnippet;
} else {
if (e.provider == c.SnippetProvider.SymbolDef) {
t = p.PromptElementKind.SymbolDefinition;
}
}
promptWishlist.append(e.announcedSnippet, t, e.priority, e.tokens, e.normalizedScore);
});
}
exports.processSnippetsForWishlist = function (snippets, languageId, tokenizer, snippetProviderOptions, priorities, numberOfSnippets, maxSnippetLength) {
const {
reserved: reserved,
candidates: candidates
} = selectSnippets(snippets, numberOfSnippets, snippetProviderOptions);
let d = 0;
let h = [];
let highPriorities = priorities.high;
let lowPriorities = priorities.low;
function g(snippet, r) {
const o = announceSnippet(snippet, languageId);
const c = tokenizer.tokenLength(o);
let l;
if (r + c <= maxSnippetLength) {
l = highPriorities;
highPriorities = priorities.priorities.justBelow(l);
} else {
l = lowPriorities;
lowPriorities = priorities.priorities.justBelow(l);
}
h.push({
announcedSnippet: o,
provider: snippet.provider,
providerScore: snippet.providerScore,
normalizedScore: snippet.normalizedScore,
priority: l,
tokens: c,
relativePath: snippet.relativePath
});
return r + c;
}
for (const snippet of [...reserved, ...candidates]) {
if (h.length >= numberOfSnippets) break;
d = g(snippete, d);
}
l(h);
h.reverse();
return h;
};
function announceSnippet(e, t) {
const n = s[e.semantics];
let i = (e.relativePath ? `Compare this ${n} from ${e.relativePath}:` : `Compare this ${n}:`) + "\n" + e.snippet;
if (i.endsWith("\n")) {
i += "\n";
}
return r.commentBlockAsSingles(i, t);
}
promptWishlist.appendLineForLine(source.substring(0, offset), p.PromptElementKind.BeforeCursor, beforeCursorPriority).forEach((e) =>
V.push(e)
);
appendLineForLine(text, kind, priority) {
const lineArr = (lines = this.convertLineEndings(text)).split("\n");
for (let i = 0; i < lineArr.length - 1; i++) lineArr[i] += "\n";
const lines = [];
lineArr.forEach((line) => {
if ("\n" === line && lines.length > 0 && !lines[lines.length - 1].endsWith("\n\n")) {
lines[lines.length - 1] += "\n";
} else {
lines.push(line);
}
});
const result = [];
lines.forEach((text, index) => {
if ("" !== text) {
result.push(this.append(text, kind, priority));
if (index > 0) {
this.content[this.content.length - 2].requires = [
this.content[this.content.length - 1],
];
}
}
});
return result;
}
6.5 wishList 的 fullfill 整合处理
if (0 === promptOpts.suffixPercent || q.length <= promptOpts.fimSuffixLengthThreshold)
return promptWishlist.fulfill(promptOpts.maxPromptLength);
fulfill(maxPromptLength) {
const promptChoices = new PromptChoices();
const promptBackground = new PromptBackground();
const elements = this.content.map((e, t) => ({
element: e,
index: t,
}));
elements.sort((e, t) =>
e.element.priority === t.element.priority
? t.index - e.index
: t.element.priority - e.element.priority
);
const requires = new Set();
const excludes = new Set();
let lastElement;
const results = [];
let promptLength = maxPromptLength;
elements.forEach((e) => {
const element = e.element;
const index = e.index;
if (
promptLength >= 0 &&
(promptLength > 0 || void 0 === lastElement) &&
element.requires.every((e) => requires.has(e.id)) &&
!excludes.has(r.id)
) {
let tokens = element.tokens;
const nextElement = (function (e, t) {
let n;
let r = 1 / 0;
for (const i of e)
if (i.index > t && i.index < r) {
n = i;
r = i.index;
}
return n;
})(results, index)?.element;
if (element.text.endsWith("\n\n") && nextElement && !nextElement.text.match(/^\s/)) {
tokens++;
}
if (promptLength >= tokens) {
promptLength -= tokens;
requires.add(r.id);
element.excludes.forEach((e) => excludes.add(e.id));
promptChoices.markUsed(element);
promptBackground.markUsed(element);
results.push(e);
} else {
if (void 0 === lastElement) {
lastElement = e;
} else {
promptChoices.markUnused(e.element);
promptBackground.markUnused(e.element);
}
}
} else {
promptChoices.markUnused(element);
promptBackground.markUnused(element);
}
});
results.sort((e, t) => e.index - t.index);
let prefix = results.reduce((e, t) => e + t.element.text, "");
let prefixLength = this.tokenizer.tokenLength(prefix);
for (; prefixLength > maxPromptLength; ) {
u.sort((e, t) =>
t.element.priority === e.element.priority
? t.index - e.index
: t.element.priority - e.element.priority
);
const e = u.pop();
if (e) {
promptChoices.undoMarkUsed(e.element);
promptChoices.markUnused(e.element);
promptBackground.undoMarkUsed(e.element);
promptBackground.markUnused(e.element);
if (void 0 !== lastElement) {
promptChoices.markUnused(lastElement.element);
promptBackground.markUnused(lastElement.element);
}
lastElement = void 0;
}
u.sort((e, t) => e.index - t.index);
prefix = u.reduce((e, t) => e + t.element.text, "");
prefixLength = this.tokenizer.tokenLength(prefix);
}
const f = [...u];
if (void 0 !== lastElement) {
f.push(lastElement);
f.sort((e, t) => e.index - t.index);
const prefix = f.reduce((e, t) => e + t.element.text, "");
const prefixLength = this.tokenizer.tokenLength(prefix);
if (prefixLength <= maxPromptLength) {
promptChoices.markUsed(l.element);
promptBackground.markUsed(l.element);
const promptElementRanges = new PromptElementRanges(f);
return {
prefix: prefix,
suffix: "",
prefixLength: prefixLength,
suffixLength: 0,
promptChoices: promptChoices,
promptBackground: promptBackground,
promptElementRanges: promptElementRanges,
};
}
promptChoices.markUnused(l.element);
promptBackground.markUnused(l.element);
}
const m = new PromptElementRanges(u);
return {
prefix: prefix,
suffix: "",
prefixLength: prefixLength,
suffixLength: 0,
promptChoices: promptChoices,
promptBackground: promptBackground,
promptElementRanges: m,
};
}
首先按照 Priority 排序(Priority 相同按 index),处理文本内容,这就意味着,在有限的 Token 下,Priority 越高的文本越能被保障。
输出的时候,是按照 index 排序的,也就是说 Priority 只用作处理文本的优先级,最终组合的 prefix 文本的顺序是按照插入 wishList 的先后顺序的。
languageMarker; pathMarkder; importedFile; Snippet;
beforeCursor。
beforeCursor;
importedFile;
Snippet;
pathMarkder;
languageMarker。
6.6 Prompt 组成的图示
07
08
对于编辑器输入的边界判断,包括太少、太多、取消等等很多场景齐全的考虑;
缓存思想,利用多级缓存策略保护后台,模型运算本身就是一件昂贵的事情;
prompt 的设计,不仅仅包含了上下文代码,在文件解析、编辑器打开的相关代码上还做了很多;
利用简单的 Jaccard 算法计算分词后的文本相似度,能够快速决策出当前上下文相关的 snippet;
实验特性,在 Copilot 中,大量的参数、优先级、设置字段都是通过实验来控制的,有一套完整的监控上报体系,帮助 Copilot 去调整这些参数,以达到更好的效果。