
I have this code for generating anagrams, which seems to work as expected. It takes an input word and finds all subwords that can be generated from its letters, down to 3-letter words. However, it is painfully slow (10k words processed in 5 minutes, which projects to about 4 hours for the full list), and it produces extremely large JSON files.

const terms = require('../configurations/terms.json')
const fs = require('fs')

function* getCombinations(array, k, taken=[]) {
    if (k == 0) return yield taken;   // combination complete
    if (array.length < k) return;     // not enough elements left
    // Either take the first element...
    yield* getCombinations(array.slice(1), k - 1, taken.concat(array[0]));
    // ...or skip it.
    yield* getCombinations(array.slice(1), k, taken);
}
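// For reference, the generator yields every k-element subsequence, e.g.
// [...getCombinations(['a', 'b', 'c'], 2)] -> [['a','b'], ['a','c'], ['b','c']]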

const output = {}

let i = 0

let trie = {}

// First pass: build a trie over the sorted letters of each term;
// the words themselves are recorded on the terminal nodes.
terms.forEach(input => {
  // frequencies[input.slug] ??= getFrequencies(input.slug)

  const letters = [...input.slug].sort()

  let node = { children: trie }

  letters.forEach(letter => {
    node = node.children[letter] = node.children[letter] ?? { word: {}, children: {} }
  })

  node.word[input.slug] = true

  i++

  // if (i % 1000 == 0) {
  //   console.log(i)
  // }
})

// Second pass: enumerate every sorted letter combination (length >= 3)
// of each term and keep the combinations that match a word in the trie.
terms.forEach((input, xi) => {
  const letters = [...input.slug]

  if (xi % 10 === 0) {
    console.log(xi)
    fs.writeFileSync('configurations/term.anagram.en.json', JSON.stringify(output, null, 2))
  }

  const combinations = {}

  let i = letters.length
  while (i >= 3) {
    const array = [...getCombinations(letters, i)]
    array.forEach(item => combinations[item.sort().join('')] = true)
    i--
  }

  output[input.slug] = []

  for (const combination in combinations) {
    const match = findMatch([...combination])
    if (match) {
      output[input.slug].push(combination)
    }
  }
})

function findMatch(letters) {
  let node = { children: trie }
  let i = 0
  while (node && i < letters.length) {
    node = node.children[letters[i++]]
  }
  // Every node carries an (often empty) `word` object, so check that a
  // word actually ends here rather than relying on truthiness alone.
  return Boolean(node && Object.keys(node.word ?? {}).length > 0)
}

// terms.forEach((input, xi) => {
//   const letters = [...input.slug]

//   if (xi % 10 === 0) {
//     console.log(xi)
//     fs.writeFileSync('configurations/term.anagram.en.json', JSON.stringify(output, null, 2))
//   }

//   const combinations = []

//   let i = letters.length
//   while (i >= 3) {
//     const array = [...getCombinations(letters, i)]
//     array.forEach(item => combinations.push(item))
//     i--
//   }

//   const matchers = combinations.map(x => x.sort())

//   const matches = {}

//   matchers.forEach(matcher => {
//     const match = findMatch(matcher)
//     output[matcher] = match
//     matches[matcher] = true
//   })

//   output[input.slug] = Object.keys(matches)
// })

The terms can be simulated with this ~500k word list.

So for example, if the input word is:

interesting

The "subwords" are:

rest
set
sing
tint
get
resting
nesting
getter
...

Etc. I only care about finding words down to 3 letters, so "in", while it is a match, wouldn't be included in the output.

The problem starts when it finds that, say, "yee", "eye", and "eey" are all words (combinations of the same letters): the 500k word pile has all kinds of junk words, leading to an explosion of short words from different combinations of letters. I tried to cut that out in my example: to "hydrate" all the possibilities, you would first fetch the "initial list" given a word as a key, then for each word in that sublist, recursively use it as a key to find its subwords, and so on. I can't see in my mind yet if this would work properly, but it seems close.
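For what it's worth, here is a minimal sketch of that hydration lookup, assuming `output` maps each key to its direct sublist as produced above (`hydrate` is an illustrative name, and the sublist entries would need to be usable as keys for the recursion to bottom out):

function hydrate(key, output, seen = new Set()) {
  if (seen.has(key)) return seen   // already expanded; avoids rework and cycles
  seen.add(key)
  for (const sub of output[key] ?? []) {
    hydrate(sub, output, seen)     // expand each subword key in turn
  }
  return seen
}

// e.g. const everything = [...hydrate('interesting', output)]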

Is there any way you can think of to optimize this, both in terms of runtime speed and JSON size on disk? Pretty-printed JSON was over 5 million lines at only 10k words processed, which is a lot IMO. How can this be optimized and stored so it takes up no more than, say, 50MB of space? And if the current ~4 hour runtime can be reduced, can it be brought down to less than a minute, or at least under half an hour?

2 Answers


  1. Chosen as BEST ANSWER

    Here is something I tinkered with and landed on. I'm not 100% sure it's exactly what I needed, but it's pretty close.

    It produces two JSON files: a mapping from sorted letter sets to words, and a mapping from sorted letter sets to sorted child letter sets. You need to traverse the second file recursively, gathering the words from the first file as you go (and perhaps deduplicating); see the sketch after the code. Not perfect, but it ran in under 5 minutes, and I think the file size should be reasonable.

    const terms = require('../configurations/terms.json')
    const fs = require('fs')
    const _ = require('lodash')
    
    function* getCombinations(array, k, taken=[]) {
        if (k == 0) return yield taken;
        if (array.length < k) return;
        yield* getCombinations(array.slice(1), k - 1, taken.concat(array[0]));
        yield* getCombinations(array.slice(1), k, taken);
    }
    
    let i = 0
    
    let trie = {}
    
    terms.forEach(input => {
      // frequencies[input.slug] ??= getFrequencies(input.slug)
    
      const letters = [...input.slug].sort().join('')
    
      trie[letters] ??= []
      trie[letters].push(input.slug)
    
      i++
    
      // if (i % 1000 == 0) {
      //   console.log(i)
      // }
    })
    
    fs.writeFileSync('tmp/unscramble.trie.json', JSON.stringify(trie, null, 2))
    
    const globalMap = {}
    terms.forEach((term, i) => {
      const word = [...term.slug].sort()
      recursivelyProcessWord(word, globalMap, word)
    
      if (i % 1000 == 0) {
        console.log(i)
        fs.writeFileSync('tmp/unscramble.json', JSON.stringify(globalMap, null, 2))
      }
    })
    
    // Recursively record, for each sorted letter set, the child letter
    // sets that correspond to real words.
    function recursivelyProcessWord(word, globalMap, parentWord) {
      if (!word.length) {
        return
      }
    
      const sortedText = parentWord.join('')
    
      if (globalMap[sortedText]) {
        return
      }
    
      const sorted = {}
    
      const combinations = [...getCombinations(word, word.length - 1)]
    
      const uniqCombinations = _.uniqBy(combinations, x => x.sort().join('')).map(x => x.sort())
    
      uniqCombinations.forEach(combination => {
        const combinationText = combination.sort().join('')
        if (sorted[combinationText]) {
          return
        }
        const words = trieContains(combination)
        if (words.length) {
          sorted[combinationText] = true
          recursivelyProcessWord(combination, globalMap, combination)
        } else {
          recursivelyProcessWord(combination, globalMap, word)
        }
      })
    
      globalMap[sortedText] ??= {}
    
      _.merge(globalMap[sortedText], sorted)
    }
    
    function trieContains(letters) {
      return trie[letters.join('')] ?? []
    }
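
    To hydrate an input word, traverse the second file recursively and collect the actual words from the first as you go. A minimal sketch of that lookup, assuming the two files written above (`hydrate` is an illustrative name):

    const anagrams = require('../tmp/unscramble.trie.json') // sorted letters -> words
    const children = require('../tmp/unscramble.json')      // sorted letters -> child letter sets

    function hydrate(sortedText, words = new Set(), seen = new Set()) {
      if (seen.has(sortedText)) return words   // already expanded
      seen.add(sortedText)
      for (const word of anagrams[sortedText] ?? []) {
        words.add(word)                        // real words for this letter set
      }
      for (const child in children[sortedText] ?? {}) {
        hydrate(child, words, seen)            // descend into child letter sets
      }
      return words
    }

    // e.g. [...hydrate([...'interesting'].sort().join(''))]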
    

  2. To optimize the computation:

    • assign each of the 500K words in the initial list an int32 key, where
      the first 26 bits indicate whether or not the word contains the
      letter [A..Z], the 27th whether it contains the character "-", and
      the 28th whether it contains any other character.
    • store the keys in a 500K-entry int32 table, in parallel with the words.
    • compute the input_key corresponding to the input word.
    • loop over the key table and ignore every entry that does not satisfy
      table_key[i] = table_key[i] AND input_key, i.e. every character of
      word i also appears in the input word (see the sketch below).
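
    A minimal JavaScript sketch of this keying scheme (`words`, `buildKey`, and `candidates` are illustrative names, with `words` assumed to be the flat 500K word list):

    // Build a 28-bit key: bits 0-25 mark the letters a-z, bit 26 marks "-",
    // and bit 27 marks any other character.
    function buildKey(word) {
      let key = 0
      for (const ch of word) {
        const i = ch.toLowerCase().charCodeAt(0) - 97 // 'a' -> 0 ... 'z' -> 25
        if (i >= 0 && i < 26) key |= 1 << i
        else if (ch === '-') key |= 1 << 26
        else key |= 1 << 27
      }
      return key
    }

    // Key table stored in parallel with the word list.
    const keys = new Int32Array(words.length)
    words.forEach((word, i) => { keys[i] = buildKey(word) })

    // Keep only the words whose distinct characters all occur in the input.
    function candidates(input) {
      const inputKey = buildKey(input)
      return words.filter((word, i) => (keys[i] & inputKey) === keys[i])
    }

    Note this is only a prefilter: the key ignores letter counts, so the surviving candidates still need an exact check against the input's letter multiset.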