作为我研究工作的一部分,我尝试编写了一个用于探索词向量的开源网页演示。其功能在网站上另有说明。这也是我第一次比较深入地使用HTML、JavaScript和CSS,并且用到了较新的HTML和JS特性(实际上,它只能在较新的浏览器上运行)。作为一名程序员,我觉得代码中有几处不太妥当;根据我的经验,这些地方会让代码不够整洁:
我希望让我的代码更简洁、更易于维护,也希望回答能够解决我上面提到的顾虑。大多数计算都是预先完成的,因此这里的演示函数应该都不会有性能问题。要在本地运行代码,请下载这个预发布版本(pre-release)。
index.html
Word2Vec Demo
Word2Vec Demo
Age
Gender
Add/Remove Word
Vector analogy arithmetic
-
+
=
Submit
Dimension Name
Dimension Name
Submit
style.css
/* Center the page title. */
h1 {
    text-align: center;
}

/* Plot area: fixed-height grid with three columns (50% / 15% / 35%). */
#plots-wrapper {
    /* default width 100% */
    height: 70vh;
    display: grid;
    grid-template-columns: 50% 15% 35%;
}

/* Vertical stack; children are laid out top to bottom. */
#scatter-wrapper {
    display: flex;
    flex-direction: column;
}

/* Let the scatter plot absorb the free space of its flex container
   (presumably #scatter-wrapper; confirm against index.html). */
#plotly-scatter {
    flex-grow: 1;
}

/* Status bar: three equal-width columns. */
#plots-status-bar {
    display: grid;
    grid-template-columns: 1fr 1fr 1fr;
}

#modify-word-input {
    width: 50%;
}

/* bind vector axis click: make the axis labels receive pointer events
   (the script attaches click handlers to the text inside this layer) */
.yaxislayer-above {
    cursor: pointer;
    pointer-events: all;
}

/* Two equal-width columns for the user-defined dimensions. */
#user-dimension-wrapper {
    display: grid;
    grid-template-columns: 1fr 1fr;
}

.user-dimension-area {
    display: flex;
    flex-direction: column;
}

.user-dimension-entry {
    display: grid;
    grid-template-columns: 1fr 1fr;
    margin: 5px;
}
.user-dimension-feature-set {
margin: 5px;
}
word2vec.js (主要功能)
"use strict";
// Half-width (in vector dimensions) of the magnified view around the hovered dimension
const MAGNIFY_WINDOW = 5;

// Lower and upper bound of the heatmap colorscale
const HEATMAP_MIN = -0.2;
const HEATMAP_MAX = 0.2;

// To be used for naming features
let feature1Name;
let feature2Name;

// Word lists used for creating the two feature dimensions
let feature1Set1;
let feature1Set2;
let feature2Set1;
let feature2Set2;

// Word pairs used to compute features
const FEATURE1_PAIRS = [
    ["man", "woman"],
    ["king", "queen"],
    ["prince", "princess"],
    ["husband", "wife"],
    ["father", "mother"],
    ["son", "daughter"],
    ["uncle", "aunt"],
    ["nephew", "niece"],
    ["boy", "girl"],
    ["male", "female"],
];

const FEATURE2_PAIRS = [
    ["man", "boy"],
    ["woman", "girl"],
    ["king", "prince"],
    ["queen", "princess"],
    ["father", "son"],
    ["mother", "daughter"],
    ["uncle", "nephew"],
    ["aunt", "niece"],
];

// Residual words: every distinct word appearing in either pair list, in first-seen order
const RESIDUAL_WORDS = Array.from(
    new Set([...FEATURE1_PAIRS.flat(), ...FEATURE2_PAIRS.flat()])
);

// ---- Global variables for various plotting functionality ----

// Words plotted on the scatter plot.
// Changes from original demo: replace "refrigerator" with "chair" and "computer".
let scatterWords = [
    "man", "woman", "boy", "girl", "king", "queen", "prince", "princess", "nephew", "niece",
    "uncle", "aunt", "father", "mother", "son", "daughter", "husband", "wife", "chair", "computer",
];

// Words involved in the computation of analogy in scatter plot (#12)
let analogyScatterWords = [];

// Words to show in vector display
let vectorWords = ["queen", "king", "girl", "boy", "woman", "man"];

// Selected word in scatterplot (empty string represents nothing selected)
let selectedWord = "";

// Saved hoverX for use in magnify view
let hoverX = MAGNIFY_WINDOW;

// Main word-to-vector Map (may include pseudo-word vectors like "man+woman")
let vecs = new Map();

// Set of actual words found in model
let vocab = new Set();

// Word vector dimension
let vecsDim;

// Nearest words Map
let nearestWords;

// Vector calculations and plotting, including residual (issue #3)
let feature1;
let feature2;
let residualFeature;
/**
 * Read the raw model text and load it into the shared `vecs` and `vocab` state.
 *
 * Each line holds a word followed by its space-separated vector components.
 * Every vector is normalized with `unit()` before it is stored, and `vecsDim`
 * is set to the number of components on the line just read.
 *
 * @param {string} text - full text of the model file
 */
function processRawVecs(text) {
    for (const rawLine of text.trim().split(/\n/)) {
        const [word, ...components] = rawLine.trim().split(' ');
        vecsDim = components.length;
        const unitVec = new Vector(components.map(Number)).unit(); // normalize word vectors
        vocab.add(word);
        vecs.set(word, unitVec);
    }

    // sanity check for debugging input data
    for (const word of RESIDUAL_WORDS) {
        console.assert(vecs.has(word), word + " not in vecs");
    }
}
/**
 * Parse the nearest-words text into a Map.
 *
 * Each line holds a target word followed by its space-separated neighbor words.
 *
 * @param {string} text - full text of the nearest-words file
 * @returns {Map<string, string[]>} target word -> list of the words after it on its line
 */
function processNearestWords(text) {
    const entries = text
        .trim()
        .split(/\n/)
        .map((rawLine) => {
            const [target, ...neighbors] = rawLine.trim().split(' ');
            return [target, neighbors];
        });
    return new Map(entries);
}
/**
 * Create a unit feature-dimension vector from two parallel word lists.
 *
 * For each index i the vector of wordSet2[i] is subtracted from the vector of
 * wordSet1[i]; the differences are summed and the sum is normalized.
 *
 * @param {Map<string, object>} vecs - word -> vector; vectors must provide sub(), add() and unit()
 * @param {string[]} wordSet1 - first word of each pair
 * @param {string[]} wordSet2 - second word of each pair (same length as wordSet1)
 * @returns {object} unit vector pointing from the wordSet2 words towards the wordSet1 words
 * @throws {Error} if no pairs are given or a required word has no vector
 */
function createFeature(vecs, wordSet1, wordSet2) {
    console.assert(wordSet1.length === wordSet2.length);

    // reduce() without an initial value throws an opaque TypeError on an empty list
    if (wordSet1.length === 0) {
        throw new Error("createFeature: at least one word pair is required");
    }

    // the word lists can come from user input, so report unknown words by name
    const lookup = (word) => {
        const vec = vecs.get(word);
        if (vec === undefined) {
            throw new Error(`createFeature: "${word}" has no vector`);
        }
        return vec;
    };

    // for each pair of words, subtract vectors
    const differences = wordSet1.map((word1, i) => lookup(word1).sub(lookup(wordSet2[i])));

    // average subtracted vectors into one unit feature vector
    return differences.reduce((sum, diff) => sum.add(diff)).unit();
}
/**
 * Plot each word on a 3D scatterplot whose axes are the projections onto the
 * residual feature (x), feature 1 (y, titled "Gender") and feature 2 (z, titled "Age").
 *
 * Side effects: recomputes the globals feature1, feature2 and residualFeature,
 * stores the two features in vecs as the pseudo-words "[gender]" and "[age]",
 * redraws the "plotly-scatter" element and binds its click handler.
 *
 * @param {boolean} [newPlot=false] - true on the first draw (use the default camera);
 *     otherwise the camera of the currently displayed plot is reused
 */
function plotScatter(newPlot=false) {
    // populate feature vectors
    feature1 = createFeature(vecs, feature1Set1, feature1Set2);
    feature2 = createFeature(vecs, feature2Set1, feature2Set2);

    const allFeatureWords = feature1Set1.concat(feature1Set2).concat(feature2Set1).concat(feature2Set2);
    const residualWords = [...new Set(allFeatureWords)];

    // residual dim calculation described in #3:
    // remove the feature1 component, then the feature2 component, from every feature word
    residualFeature = residualWords.map(word => {
        const wordVec = vecs.get(word);
        const wordNoFeature1 = wordVec.sub(feature1.scale(wordVec.dot(feature1)));
        const wordResidual = wordNoFeature1.sub(feature2.scale(wordNoFeature1.dot(feature2)));
        return wordResidual;
    }).reduce((a,b) => a.add(b)).unit(); // average over residual words and normalize

    // add features as pseudo-words
    // TODO: not hard-code
    vecs.set("[age]", feature2);
    vecs.set("[gender]", feature1);

    // words to actually be plotted (so scatterWords is a little misleading)
    const plotWords = [...new Set(scatterWords.concat(analogyScatterWords))];

    // x, y, z are simply projections onto features
    // use 1 - residual for graphical convention (#3)
    const x = plotWords.map(word => 1 - vecs.get(word).dot(residualFeature));
    const y = plotWords.map(word => vecs.get(word).dot(feature1));
    const z = plotWords.map(word => vecs.get(word).dot(feature2));

    // color points by type with priority (#12)
    const color = plotWords.map(word =>
        (word === selectedWord) ? "#FF0000"
        : (word === analogyScatterWords[3]) ? "#FF8888"
        : (word === analogyScatterWords[4]) ? "#00FF00"
        : (analogyScatterWords.includes(word)) ? "#0000FF"
        : "#000000"
    );

    // For each point, include numbered list of nearest words in hovertext.
    // Lines are separated with <br>, which Plotly renders as a line break;
    // a word without a nearest-words entry gets an empty list instead of crashing.
    const hovertext = plotWords.map(target => {
        const neighbors = nearestWords.get(target) || [];
        const numbered = neighbors.map((word, i) => `${i+1}. ${word}`);
        return `Reference word:<br>${target}<br>Nearest words:<br>${numbered.join("<br>")}`;
    });

    const data = [
        {
            x: x,
            y: y,
            z: z,
            mode: "markers+text",
            type: "scatter3d",
            marker: {
                size: 4,
                opacity: 0.8,
                color: color
            },
            text: plotWords,
            hoverinfo: "text",
            hovertext: hovertext
        }
    ];

    const ZOOM = 0.8;
    const scatterDiv = document.getElementById("plotly-scatter");

    // save previous camera code (workaround for #9)
    let camera;
    if (newPlot) {
        camera = {eye: {x: -2.5*ZOOM, y: -0.75*ZOOM, z: 0.5*ZOOM}};
    } else { // save camera
        camera = scatterDiv.layout.scene.camera;
    }
    console.log("Using camera", camera);

    const layout = {
        title: {text: "Word vector projection"},
        //uirevision: "true",
        scene: {
            xaxis: {title: "Residual", dtick: 0.1},
            yaxis: {title: "Gender", dtick: 0.1},
            zaxis: {title: "Age", dtick: 0.1},
            camera: camera
        },
        margin: {l:0, r:0, t:30, b:0}, // maximize viewing area
        font: {size: 12}
    };

    // always make new plot (#9)
    // replotting scatter3d produces ugly error (#10)
    Plotly.newPlot("plotly-scatter", data, layout);

    // bind scatter click event: clicking a point toggles it as the selected word
    scatterDiv.on("plotly_click", (event) => {
        const ptNum = event.points[0].pointNumber;
        const clickedWord = plotWords[ptNum];
        if (clickedWord === selectedWord) { // deselect
            selectedWord = "";
            console.log("Deselected", clickedWord);
        } else { // select
            selectedWord = clickedWord;
            console.log("Selected", selectedWord);
        }

        // replot with new point color
        plotScatter();
    });
}
/**
 * Toggle selection of one of the axis pseudo-words and replot the scatter.
 *
 * @param {number} axis - index into ["[age]", "[gender]"]
 */
function selectAxis(axis) {
    // TODO: cleanup
    console.log("button", axis);
    const axisNames = ["[age]", "[gender]"];
    const clickedName = axisNames[axis];

    // clicking the already selected axis deselects it
    selectedWord = (selectedWord === clickedName) ? "" : clickedName;

    // TODO: move updating button color to own function that is also called on scatter click
    axisNames.forEach((name, i) => {
        const button = document.getElementById("scatter-button" + i);
        button.style.color = (selectedWord === name) ? "red" : "black";
    });

    plotScatter(); // replot selected word
}
/**
 * Bind a click handler to the y-axis labels of the heatmaps.
 *
 * When a word is selected, clicking a label replaces the vector-view word at
 * that label's index with the selected word, clears the selection and replots.
 * Affects all heatmaps since they all have .yaxislayer-above!
 * https://stackoverflow.com/a/47400462
 */
function updateHeatmapsOnWordClick() {
    console.log("Binding heatmap click event");

    const onLabelClick = (event) => {
        const idx = event.target.__data__.x;
        console.log("Clicked on", idx);
        if (!selectedWord) {
            return; // nothing selected, nothing to swap in
        }
        // modify vector view to show selected word and then deselect
        vectorWords[idx] = selectedWord;
        selectedWord = "";
        // replot all
        plotScatter();
        plotVector();
    };

    d3.selectAll(".yaxislayer-above").selectAll("text").on("click", onLabelClick);
}
/**
 * Plot the vector heatmap for vectorWords and refresh the magnified view.
 *
 * @param {boolean} [newPlot=false] - true to create the plot and bind its
 *     events; false to update the existing plot in place
 */
function plotVector(newPlot=false) {
    // heatmap plots matrix of values in z: one row per displayed word
    const rows = vectorWords.map(word => vecs.get(word));

    const trace = {
        // can't use y: vectorWords since the heatmap won't display duplicate words
        z: rows,
        zmin: HEATMAP_MIN,
        zmax: HEATMAP_MAX,
        type: "heatmap",
        ygap: 5
    };

    const xaxis = {
        title: "Vector dimension",
        dtick: 10,
        zeroline: false,
        fixedrange: true
    };
    const yaxis = {
        title: "Words",
        tickvals: d3.range(vectorWords.length),
        ticktext: vectorWords,
        fixedrange: true,
        tickangle: 60
    };
    const layout = {
        title: {text: "Vector visualization"},
        xaxis: xaxis,
        yaxis: yaxis,
        margin: {t:30}
        //dragmode: false
    };

    if (!newPlot) {
        Plotly.react("plotly-vector", [trace], layout);
        plotMagnify();
        return;
    }

    Plotly.newPlot("plotly-vector", [trace], layout);
    const vectorDiv = document.getElementById("plotly-vector");

    // bind axis click to replace word in vector display after plot
    vectorDiv.on("plotly_afterplot", updateHeatmapsOnWordClick);

    // remember the hovered dimension and re-center the magnified view on it
    vectorDiv.on("plotly_hover", (event) => {
        hoverX = event.points[0].x;
        console.log("Hover " + hoverX);
        plotMagnify();
    });

    plotMagnify(true);
}
/**
 * Plot the magnified heatmap: the dimensions within MAGNIFY_WINDOW of hoverX
 * for every word in vectorWords.
 *
 * Does nothing when the window does not fit inside [0, vecsDim).
 *
 * @param {boolean} [newPlot=false] - true to create the plot and bind its
 *     axis-click handler; false to update the existing plot in place
 */
function plotMagnify(newPlot=false) {
    // ensure hoverX will produce proper plot
    // bounds are inclusive
    const lo = hoverX - MAGNIFY_WINDOW;
    const hi = hoverX + MAGNIFY_WINDOW;
    const windowFits = 0 <= lo && hi < vecsDim;
    if (!windowFits) {
        return;
    }

    // heatmap with subset of z
    const dims = d3.range(lo, hi + 1);
    const rows = vectorWords.map(word => vecs.get(word).slice(lo, hi + 1));

    const trace = {
        x: dims,
        z: rows,
        zmin: HEATMAP_MIN,
        zmax: HEATMAP_MAX,
        type: "heatmap",
        ygap: 5,
        showscale: false
    };

    const layout = {
        title: {text: "Magnified view"},
        xaxis: {
            title: "Vector dimension",
            dtick: 1,
            zeroline: false,
            fixedrange: true
        },
        yaxis: {
            //title: "Words",
            tickvals: d3.range(vectorWords.length),
            ticktext: vectorWords,
            fixedrange: true,
            tickangle: 60
        },
        margin: {r:5, t:30} // get close to main vector view
    };

    if (!newPlot) {
        Plotly.react("plotly-magnify", [trace], layout);
        return;
    }

    Plotly.newPlot("plotly-magnify", [trace], layout);
    // bind axis click after plot, similar to vector
    document.getElementById("plotly-magnify").on("plotly_afterplot", updateHeatmapsOnWordClick);
}
/**
 * Add or remove the word typed into "modify-word-input" from the scatter plot.
 *
 * - word already plotted: remove it and clear the selection
 * - word has a vector: add it and make it the selected word
 * - otherwise: report "not found" and leave plot, selection and input untouched
 *
 * The outcome is written to "modify-word-message".
 */
function modifyWord() {
    const input = document.getElementById("modify-word-input");
    const message = document.getElementById("modify-word-message");
    const word = input.value;

    if (scatterWords.includes(word)) { // remove word
        scatterWords = scatterWords.filter(item => item !== word);
        message.innerText = `"${word}" removed`;
        selectedWord = ""; // remove selected word
    } else if (vecs.has(word)) { // add word if in wordvecs
        scatterWords.push(word);
        message.innerText = `"${word}" added`;
        selectedWord = word; // make added word selected word
    } else { // word not found
        message.innerText = `"${word}" not found`;
        return; // no replot or change to selected word
    }

    plotScatter(); // replot to update scatter view
    input.value = ""; // clear word
}
/**
 * Compute the 3COSADD word analogy for the three input boxes and display it.
 *
 * "Linguistic Regularities in Continuous Space Word Representations" (Mikolov 2013)
 * Analogy notation for words: a:b as c:d, with d unknown.
 * vector y = x_b - x_a + x_c, find w* = argmax_w cossim(x_w, y)
 *
 * Side effects: writes w* to "analogy-word-wstar", stores the pseudo-words
 * "b-a" and "b-a+c" in vecs, records the 10 nearest words of y in nearestWords (#12),
 * and replots the scatter and vector views with the analogy words (#14).
 * Returns early with a console warning if any input word has no vector.
 */
function processAnalogy() {
    const wordA = document.getElementById("analogy-word-a").value;
    const wordB = document.getElementById("analogy-word-b").value;
    const wordC = document.getElementById("analogy-word-c").value;
    const inputWords = [wordA, wordB, wordC];

    // TODO: handle more gracefully telling user if words not available
    const allKnown = inputWords.every(word => vecs.has(word));
    if (!allKnown) {
        console.warn("bad word");
        return;
    }

    // vector arithmetic, scale to unit vector
    const wordBMinusA = `${wordB}-${wordA}`;
    const wordY = `${wordB}-${wordA}+${wordC}`;
    const vecBMinusA = vecs.get(wordB).sub(vecs.get(wordA));
    const vecY = vecBMinusA.add(vecs.get(wordC)).unit();

    // find most similar words for analogy, best match first
    const ranked = [...vocab]
        .filter(word => !inputWords.includes(word)) // don't match words used in arithmetic (#12)
        .map(word => ({ word: word, similarity: vecY.dot(vecs.get(word)) }))
        .sort((p, q) => q.similarity - p.similarity);
    const nearestAnalogyWords = ranked.slice(0, 10).map(entry => entry.word);
    const wordWstar = nearestAnalogyWords[0];

    // add nearest words to Y to nearest word list (#12)
    nearestWords.set(wordY, nearestAnalogyWords);

    // write out most similar word to text box
    document.getElementById("analogy-word-wstar").value = wordWstar;

    // register arithmetic vectors so they can be shown in the vector view
    vecs.set(wordBMinusA, vecBMinusA);
    vecs.set(wordY, vecY);

    // set analogy words to display in scatter (#12) in specific order:
    analogyScatterWords = [wordB, wordA, wordC, wordY, wordWstar];
    plotScatter();

    // write arithmetic vectors to vector view (#14)
    vectorWords = [wordB, wordA, wordBMinusA, wordC, wordY, wordWstar].reverse();
    plotVector();
}
/**
 * Inflate the compressed model buffer and decode it to text.
 *
 * The work itself runs synchronously; the function is async only so that
 * callers can await it and so that any error surfaces as a rejected promise.
 *
 * inflate option to:"string" freezes browser, see https://github.com/nodeca/pako/issues/228
 * TextDecoder may hang browser but seems much faster
 *
 * @param {ArrayBuffer} vecsBuf - compressed model data
 * @returns {Promise<string>} the decompressed model text
 */
async function unpackVectors(vecsBuf) {
    const vecsUint8 = pako.inflate(vecsBuf);
    return new TextDecoder().decode(vecsUint8);
}
/**
 * Fill in the default words used to define the semantic dimensions for the
 * scatterplot: one newline-separated word list per input element.
 */
function fillDimensionDefault() {
    const defaultsById = {
        "user-dimension-feature1-set1": "man\nking\nprince\nhusband\nfather\nson\nuncle\nnephew\nboy\nmale",
        "user-dimension-feature1-set2": "woman\nqueen\nprincess\nwife\nmother\ndaughter\naunt\nniece\ngirl\nfemale",
        "user-dimension-feature2-set1": "man\nwoman\nking\nqueen\nfather\nmother\nuncle\naunt",
        "user-dimension-feature2-set2": "boy\ngirl\nprince\nprincess\nson\ndaughter\nnephew\nniece",
    };

    for (const [id, words] of Object.entries(defaultsById)) {
        document.getElementById(id).textContent = words;
    }
}
/**
 * Read the four user-dimension inputs and store their word lists in
 * feature1Set1, feature1Set2, feature2Set1 and feature2Set2.
 *
 * Each input holds one word per line. Surrounding whitespace is trimmed and
 * blank lines are dropped, so a trailing newline or a stray space does not
 * produce an empty or padded "word" that has no vector.
 */
function processDimensionInput() {
    const readWordList = (id) => document.getElementById(id).value
        .split('\n')
        .map(word => word.trim())
        .filter(word => word.length > 0);

    // TODO: user validation (words are not yet checked against the model vocabulary)
    feature1Set1 = readWordList("user-dimension-feature1-set1");
    feature1Set2 = readWordList("user-dimension-feature1-set2");
    feature2Set1 = readWordList("user-dimension-feature2-set1");
    feature2Set2 = readWordList("user-dimension-feature2-set2");

    console.log(feature1Set1, feature1Set2, feature2Set1, feature2Set2);
}
/**
 * Load the model and the nearest-words list, then draw the initial plots.
 *
 * Progress is reported through the "loading-text" element. A failed download
 * (non-2xx HTTP status) rejects with an Error naming the resource instead of
 * passing the error page on to the decompressor or parser.
 *
 * @returns {Promise<void>}
 */
async function main() {
    // fetch() only rejects on network failure, so HTTP errors must be checked explicitly
    const fetchChecked = async (url) => {
        const response = await fetch(url);
        if (!response.ok) {
            throw new Error(`Failed to fetch ${url}: ${response.status} ${response.statusText}`);
        }
        return response;
    };

    // fill default feature for scatterplot
    fillDimensionDefault();

    // lo-tech progress indication
    const loadingText = document.getElementById("loading-text");
    loadingText.innerText = "Downloading model...";

    // note python's http.server does not support response compression Content-Encoding
    // browsers and servers support content-encoding, but manually compress to fit on github (#1)
    const vecsResponse = await fetchChecked("wordvecs50k.vec.gz");
    const vecsBuf = await vecsResponse.arrayBuffer();

    // async unpack vectors
    loadingText.innerText = "Unpacking model...";
    const vecsText = await unpackVectors(vecsBuf);

    loadingText.innerText = "Processing vectors...";
    processRawVecs(vecsText);

    // fetch nearest words list
    const nearestWordsResponse = await fetchChecked("nearest_words.txt");
    const nearestWordsText = await nearestWordsResponse.text();
    nearestWords = processNearestWords(nearestWordsText);

    loadingText.innerText = "Model processing done";

    processDimensionInput();

    // plot new plots for the first time
    plotScatter(true);
    plotVector(true);
}
// Start main() once the DOM has loaded; any rejection is logged to the console
document.addEventListener("DOMContentLoaded", () => {
    main().catch((err) => {
        console.error(err);
    });
});
发布于 2021-08-07 21:23:51
这次提交的代码量相当大,所以我会把重点放在我认为最重要的几点上。
我从浏览代码中得到的感觉:
我认为它的不足主要在架构方面(你在关于全局变量的说明中也暗示了这一点),其次是注释方面的一些问题。
processRawVecs和processNearestWords的名称中都使用了“process”一词。前者没有返回值,而是把结果存入全局变量;后者则是一个纯函数。这对我来说是一个信号,说明“process”这个词并没有任何特定含义,换成“doStuff”也一样。我建议把所有纯函数改成更具体的名称,例如getNearestWords。目标是为不同的事物建立各自独立的抽象。processRawVecs同样可以写成纯函数,把写入全局状态的部分移到函数之外。事实上,如果你在所有函数中都积极地这样做,最终大部分逻辑都会包含在纯函数里,其余代码只负责组织状态的变更。做到这一步之后,你很可能就不再需要全局变量了(它们可以放进某个函数或类实例中);即使仍然需要,接触它们的地方也会更少(原先对全局变量的大多数使用会变成函数参数)。这种技术被称为“函数式核心,命令式外壳”(functional core, imperative shell)。plotScatter是另一个例子:其中有相当多的逻辑,但这些逻辑都无法在不同时绘制图表的情况下运行。这样做的一个后果是,很难甚至不可能对它进行单独测试;而难以单独测试的代码,通常也难以单独推理。总结一下:我建议你这样组织代码,以便
这样做的结果是,你有望更好地控制全局状态,并得到更易于测试、更易于理解的代码。当然,收益的大小取决于应用程序最终的规模。
// to be used for naming features这条注释是有用的,否则我会把它下面那些未使用的变量删掉。我认为大多数注释反映的是抽象不清晰、缺少函数、函数名过短,或者注释本身根本没有必要。片段1:
// create feature dimension vectors
function createFeature
思考这条注释为什么存在是件很有意思的事。我至少能想到三种可能性:
片段2:
// populate feature vectors
feature1 = createFeature(vecs, feature1Set1, feature1Set2);
我同样想知道这条注释的目的是什么。
类似的注释还有不少。我建议把这些注释逐条读一遍,看看它们是否必要;可能只需要改几个变量名、再提取一两个函数就可以了。我认为这会提高可读性并减少噪音。
https://codereview.stackexchange.com/questions/265802
复制相似问题