Prime Pairs code for Goldbach's Conjecture

I've hit my frustration limit, and would really like some help in improving this code.

The code finds all the prime pairs that sum to n for any even n > 2.
If you want an explanation of how|why it works read the following paper.

Proof of Goldbach's Conjecture and Bertrand's Postulate Using Prime Genrator Theory(GPT)

I'm sorry the Rust code is so bad, I'm not very proficient in it.
It works (gives correct results) but is horribly slow compared to the D|Crystal versions.
I suspect someone who knows Rust can make it the fastest.

Here is the D version (fastest so far).

// Compile with ldc2: $ ldc2 --release -O3 -mcpu native prime_pairs_lohi.d
// Run as: $ ./prime_pairs_lohi_u32 123_456

module prime_pairs;

import std;
import std.datetime.stopwatch : StopWatch;

void prime_pairs_lohi(uint n) {     // inputs can be of size u32
  if ((n&1) == 1 || n < 4) { return writeln("Input not even n > 2"); }
  if (n <= 6) { writeln([n, 1]); writeln([n/2, n/2]); writeln([n/2, n/2]); return; }

  // generate the low-half-residues (lhr) r < n/2
  auto ndiv2 = n/2;                 // llr:hhr midpoint
  auto rhi   = n-2;                 // max residue limit
  uint[] lhr = iota(3, ndiv2, 2).filter!(e => gcd(e, n) == 1).array;

  // identify and store all powers and cross-products of the lhr members < n-2
  uint[] lhr_mults;                 // lhr multiples, not part of a pcp
  foreach(i, r; lhr) {
    auto rmax = rhi / r;            // ri can't multiply r with values > this
    if (r < rmax) lhr_mults ~= r*r; // for r^2 multiples
    if (lhr[i+1] > rmax) break  ;   // exit if product of consecutive r’s > n-2
    foreach(ri; lhr[i+1..$]) {      // for each residue in reduced list
      if (ri > rmax) break;         // exit for r if cross-product with ri > n-2
      lhr_mults ~= r * ri;          // store value if < n-2
  } }

  // convert lhr_mults vals > n/2 to their lhr complements n-r,
  // store them, those < n/2, in lhr_del; it now holds non-pcp lhr vals
  auto lhr_del = lhr_mults.map!((r_del) => r_del > ndiv2 ? n - r_del : r_del).array;
  lhr_del.sort!("a < b");
  lhr = setDifference(lhr, lhr_del).array;

  writeln([n,     lhr.length]);     // show n and pcp prime pairs count
  writeln([lhr[0],  n-lhr[0]]);     // show first pcp prime pair of n
  writeln([lhr[$-1],n-lhr[$-1]]);   // show last  pcp prime pair of n
}

void main(string[] args) {          // directly recieve input from terminal
  string[] inputs = args[1..$];     // can include '_': 123_456
  auto nums = inputs.map!(i => i.filter!(n => n != '_'));
  auto n    = nums.map!(f => f.to!uint())[0];

  auto timer = StopWatch();         // create execution timer
  timer.start();                    // start it
  prime_pairs_lohi(n);              // run routine
  writeln(timer.peek());            // show timer results
}

Here's the Crystal version (a bit slower).

# Compile: $ crystal build --release --mcpu native prime_pairs_lohi2.cr
# Run as:  $ ./primes_pairs_lohi n 123_456_780

def prime_pairs_lohi(n)
  return puts "Input not even n > 2" unless n.even? && n > 2
  return (pp [n, 1]; pp [n//2, n//2]; pp [n//2, n//2]) if n <= 6

  # generate the low-half-residues (lhr) r < n/2
  lhr = 3u64.step(to: n//2, by: 2).select { |r| r if r.gcd(n) == 1 }.to_a
  ndiv2, rhi = n//2, n-2           # lhr:hhr midpoint, max residue limit

  # store all powers and cross-products of the lhr members < n-2
  lhr_mults = [] of typeof(n)      # lhr multiples, not part of a pcp
  lhr_dup = lhr.dup                # make copy of the lhr members list
  while (r = lhr_dup.shift) && !lhr_dup.empty? # do mults of 1st list r w/others
    rmax = rhi // r                # ri can't multiply r with values > this
    lhr_mults << r * r if r < rmax # for r^2 multiples
    break if lhr_dup[0] > rmax     # exit if product of consecutive r’s > n-2
    lhr_dup.each do |ri|           # for each residue in reduced list
      break if ri > rmax           # exit for r if cross-product with ri > n-2
      lhr_mults << r * ri          # store value if < n-2
    end                            # check cross-products of next lhr member
  end

  # remove from lhr its lhr_mults, convert vals > n/2 to lhr complements first
  lhr -= lhr_mults.map { |r_del| r_del > ndiv2 ? n - r_del : r_del }

  pp [n, lhr.size]                 # show n and pcp prime pairs count
  pp [lhr.first, n-lhr.first]      # show first pcp prime pair of n
  pp [lhr.last,  n-lhr.last]       # show last  pcp prime pair of n
end

def gen_pcp
  n = (ARGV[0].to_u64 underscore: true) # get n input from terminal
  t1 = Time.monotonic             # start execution timing
  prime_pairs_lohi(n)             # execute code
  pp Time.monotonic - t1          # show execution time
end

gen_pcp

And here's my current Rust version.

use std::time::Instant;

fn coprime(mut m: usize, mut n: usize) -> bool {
  while m|1 != 1 { let t = m; m = n % m; n = t }
  m > 0
}

fn prime_pairs_lohi(n : usize) {
  if n&1 == 1 || n < 4 { return println!("Input not even n > 2"); }
  if n <= 6 { return println!("[{}, {}]\n[{}, {}]\n[{}, {}]",n,1,n/2,n/2,n/2,n/2); };

  // generate the low-half-residues (lhr) r < n/2
  let (ndiv2, rhi) = (n/2, n-2);          // lhr:hhr midpoint, max residue limit
  let mut lhr: Vec<usize> = vec![];       // to store low half residues
  for r in (3..ndiv2).step_by(2) { if coprime(r, n) { lhr.push(r) } };

  // indentify and store all powers and cross-products of the lhr members < n-2
  let mut lhr_mults: Vec<usize> = vec![]; // lhr multiples, not part of a pcp
  let mut lhr_dup = lhr.clone();
  let mut r = lhr_dup[0];
  lhr_dup.remove(0);
  while !lhr_dup.is_empty() {
    let rmax = rhi / r;                  // ri can't multiply r with values > this
    if lhr_dup[0] > rmax { break }       // exit if product of consecutive r’s > n-2
    if r < rmax { lhr_mults.push(r * r)} // for r^2 multiples
    for ri in &lhr_dup {                 // for each residue in reduced list
      if *ri > rmax { break }            // exit for r if cross-product with ri > n-2
      lhr_mults.push(r * ri);            // store value if < n-2
    }
    r = lhr_dup[0];
    lhr_dup.remove(0);
  }

  // convert lhr_mults vals > n/2 to their lhr complements n-r,
  // store them, those < n/2, in lhr_del; it now holds non-pcp lhr vals
  let lhr_del: Vec<usize> = lhr_mults.into_iter().map(|r| if r > ndiv2 {n - r} else {r}).collect();

  for r in lhr_del { lhr.retain(|&m| m != r)};      // remove lhr_mults values from lhr
  let lcnt = lhr.len();                             // number of lhr primes left

  println!("[{}, {}]", n, lcnt);                    // show n and pcp prime pairs count
  println!("[{}, {}]", lhr[0],n-lhr[0]);            // show first pcp prime pair of n
  println!("[{}, {}]", lhr[lcnt-1], n-lhr[lcnt-1]); // show last  pcp prime pair of n
}

fn main() {
  let mut val = String::new();
  std::io::stdin().read_line(&mut val).expect("Failed to read line");
  let n: usize = val.trim().parse().unwrap();

  let start = Instant::now();
  prime_pairs_lohi(n);
  println!("{:?}", start.elapsed());
}

Thanks in advance for any help and suggestions.

First of all, did you build it with --release? This alone can bring an orders of magnitude difference.

3 Likes

I didn't read the full algorithm yet, but just a quick observation:

this loop has quadratic time complexity. for Vec, it is most efficient to remove values from the back, i.e. pop(). or better, you don't need to remove elements at all, if you just iterate a sub-slice of the data. or better yet, you don't need to make the clone lhr_dup at all, you can iterate over sub-slice of the original lhr.

8 Likes

If lhr_del is sorted you can do the retain without iteration by using binary_search().is_none() in the closure. Or you use a hashset which is what your other solutions do (I think?).

1 Like

What sort of numbers are you seeing? I get about:

10,000: 1.4ms
100,000: 110ms
1,000,000: 11s

I have no idea if that's good in comparison to the others.

I tried skipping creating and mutating lhr_dup to avoid the per-item mutation, but it made barely any difference:

  for i in 1..lhr.len() {
    let r = lhr[i-1];
    let rmax = rhi / r;                   // ri can't multiply r with values > this
    if lhr[i] > rmax { break }            // exit if product of consecutive r’s > n-2
    if r < rmax { lhr_mults.push(r * r)}  // for r^2 multiples
    for ri in lhr[i..].iter().copied() {  // for each residue in reduced list
      if ri > rmax { break }              // exit for r if cross-product with ri > n-2
      lhr_mults.push(r * ri);             // store value if < n-2
    }
  }

Before:

1000000
[1000000, 5402]
[17, 999983]
[499943, 500057]
11.8402197s

After:

1000000
[1000000, 5402]
[17, 999983]
[499943, 500057]
11.567099s

(I also double-checked with unsafe { *lhs.get_unchecked() } to ensure indexing wasn't pessimising and it made it slightly worse, so it seems LLVM is fine stripping the range checks here)


Ring a ding-ding!

  let lhr_del: std::collections::HashSet<usize> = lhr_mults.into_iter().map(|r| if r > ndiv2 {n - r} else {r}).collect();

  lhr.retain(|&m| lhr_del.get(&m).is_none());            // remove lhr_mults values from lhr

Gives:

1000000
[1000000, 5402]
[17, 999983]
[499943, 500057]
29.535ms
3 Likes

for one, lhr_mults can be much bigger than lhr, so the retain() loop is dominating. what's more, lhr_mults can contain duplicated values. using HashSet fixed both of these issues. in fact, it doesn't need to be a Vec to begin with.

although I'm not a mathematician, I'm a bit skeptical reading it.

I might be completely wrong, but this paper just gives this algorithm, but didn't "prove" anything at all.

the paper simply re-stated GC as, to quote:

Every even integer n > 6 has at least one pcp prime pair.

but it never "proved" the existence of such pair. the whole "proof" sounds like just circular argument. and the algorithm based on this theory cannot be guaranteed to always succeed. in pratice though, since GC is already checked to hold for very large numbers, there's no reason this algorithm would fail.

at the end of the day, the algorithm is nothing but another numerical procedure to verify GC, not prove it.

1 Like

Using this Rust version I show timing comparisons with D and Crystal.

The main Rust slowness is how it removes vec elements.

use std::time::Instant;

fn coprime(mut m: usize, mut n: usize) -> bool {
  while m|1 != 1 { let t = m; m = n % m; n = t }
  m > 0
}

fn prime_pairs_lohi(n : usize) {
  if n&1 == 1 || n < 4 { return println!("Input not even n > 2"); }
  if n <= 6 { return println!("[{}, {}]\n[{}, {}]\n[{}, {}]",n,1,n/2,n/2,n/2,n/2); };

  // generate the low-half-residues (lhr) r < n/2
  let (ndiv2, rhi) = (n/2, n-2);          // lhr:hhr midpoint, max residue limit
  let mut lhr: Vec<usize> = vec![];       // to store low half residues
  for r in (3..ndiv2).step_by(2) { if coprime(r, n) { lhr.push(r) } };

  // indentify and store all powers and cross-products of the lhr members < n-2
  let mut lhr_mults: Vec<usize> = vec![]; // lhr multiples, not part of a pcp
  for i in 1..lhr.len() {                 // iterate thru lhr to find prime multiples
    let r = lhr[i-1];                     // for current residue
    let rmax = rhi / r;                   // ri can't multiply r with values > this
    if r < rmax { lhr_mults.push(r * r) } // for r^2 multiples
    if lhr[i] > rmax { break }            // exit if product of consecutive r’s > n-2
    for ri in lhr[i..].iter().copied() {  // for each residue in reduced list
      if ri > rmax { break }              // exit for r if cross-product with ri > n-2
      lhr_mults.push(r * ri);             // store value if < n-2
  } }

  // convert lhr_mults vals > n/2 to their lhr complements n-r,
  // store them, those < n/2, in lhr_del; it now holds non-pcp lhr vals
  let lhr_del: std::collections::HashSet<usize> = lhr_mults.into_iter().map(|r| if r > ndiv2 {n - r} else {r}).collect();
  lhr.retain(|&m| lhr_del.get(&m).is_none());

  let lcnt = lhr.len();
  println!("[{}, {}]", n, lcnt);                    // show n and pcp prime pairs count
  println!("[{}, {}]", lhr[0],n-lhr[0]);            // show first pcp prime pair of n
  println!("[{}, {}]", lhr[lcnt-1], n-lhr[lcnt-1]); // show last  pcp prime pair of n
}

fn main() {
  let mut val = String::new();
  std::io::stdin().read_line(&mut val).expect("Failed to read line");
  let n: usize = val.trim().parse().unwrap();

  let start = Instant::now();
  prime_pairs_lohi(n);
  println!("{:?}", start.elapsed());
}

Timing comparisons with D and Crystal versions.

n = 1_000_000
D
➜  ~ ./prime_pairs_lohi_d 1_000_000
[1000000, 5402]
[17, 999983]
[499943, 500057]
49 ms, 28 μs, and 8 hnsecs

Crystal
➜  ~ ./prime_pairs_lohi_cr 1_000_000
[1000000, 5402]
[17, 999983]
[499943, 500057]
00:00:00.035435467

Rust
➜  ~ echo 1000000 | ./prime_pairs_lohi_rs
[1000000, 5402]
[17, 999983]
[499943, 500057]
214.114792ms


n = 100_000_000

D
➜  ~ ./prime_pairs_lohi_d 100_000_000
[100000000, 291400]
[11, 99999989]
[49999757, 50000243]
7 secs, 607 ms, 306 μs, and 4 hnsecs

Crystal
➜  ~ ./prime_pairs_lohi_cr 100_000_000
[100000000, 291400]
[11, 99999989]
[49999757, 50000243]
00:00:11.015920238

Rust
➜  ~ echo 100000000 | ./prime_pairs_lohi_rs
[100000000, 291400]
[11, 99999989]
[49999757, 50000243]
63.489022516s

In Crystal to remove elements of one array from another you can do just:

lhr - lhr_del => lhr -= lhr_del

In D you remove one set members from a larger set.

lhr_del.sort!("a < b");
lhr = setDifference(lhr, lhr_del).array;

I couldn't find any comparable fast idioms in Rust.
I assume there are similar faster ways to do this in Rust.
At least I hope there are.

Please study the paper.
It not only proves every even n > 6 has at lease one prime complement pair (pcp) prime pair, it also establishes a dynamically adjusting tighter bound on p than given by Bertrand's Postulate. And this may be the most important thing in the paper, because that has huge consequences (for astute number theorists).

The code isn't a proof of the GC, it's the manifestation of the proof of it!
How can such a short and simple program identify millions of prime pairs for n?
It's because all the coprime primes < n are the residues of modular groups Zn.
And the residues of Zn have very defined and deterministic properties, which I explain.

It's all in the paper. But you have to approach it with an open mind, because most people have never been exposed to its math before.

That operation is HashSet::difference

Using HashSet::difference is actually a bit slower.

This using HashSet

  // convert lhr_mults vals > n/2 to their lhr complements n-r,
  // store them, those < n/2, in lhr_del; it now holds non-pcp lhr vals
  let lhr_del: HashSet<usize> = lhr_mults.into_iter().map(|r| if r > ndiv2 {n - r} else {r}).collect();
  let a:HashSet<_> = lhr.into_iter().collect();
  let mut lhr: Vec<_> = a.difference(&lhr_del).collect();
  lhr.sort();

is slower than original code using

  // convert lhr_mults vals > n/2 to their lhr complements n-r,
  // store them, those < n/2, in lhr_del; it now holds non-pcp lhr vals
  let lhr_del: std::collections::HashSet<usize> = lhr_mults.into_iter().map(|r| if r > ndiv2 {n - r} else {r}).collect();
  lhr.retain(|&m| lhr_del.get(&m).is_none())

But this in Python3 is faster:

    # remove from lhr its lhr_mults, convert vals > n/2 to lhr complements first
    lhr_del = { (r if r < ndiv2 else n - r) for r in lhr_mults }
    lhr = [r for r in lhr if r not in lhr_del]

It seems HashSet is still removing individual elements one at a time vs the other languages doing bulk removal of all elements at once. Or at least that's what it seems is happening.

Default HashSet uses a secure hashing algorithm to prevent DoS attacks, you can switch to a faster hashing algorithm like the ahash or foldhash crates (or just switch to the hashbrown crate which is the current implementation of std HashSet, which uses the latter by default) - I probably should have mentioned this, sorry!

Or you could instead try BTreeSet which is slower on average but keeps the items in order so difference is likely significantly better (since it doesn't have to look up every item in the latter set), but it may be a wash since building it would probably be more expensive (depends on the access patterns)

There's almost certainly higher level algorithmic changes to squeeze out more performance here, but for now Rust shouldn't be being beaten by Python, no matter how clever their implementations are.

2 Likes

The real question for me is not about using HashSet or any alternative.
I want to remove the members of one array|vec from another in the fastest way possible.

In Ruby|Crystal its: lhr -= lhr_del

In D its: lhr_del.sort!("a < b"); lhr = setDifference(lhr, lhr_del).array;

In Pyrhon its: lhr = [r for r in lhr if r not in lhr_del]

These are all very fast.
I would think there must be a fast way Rust does this as well.

Here are some suggested changes.

  • Faster hasher
  • Got rid of an unneeded intermediate Vec (lhr_mults)
  • Reduced indexing

It runs faster in the playground. I didn't do any rigorous testing so I don't know which were most helpful (or hurtful or irrelevant). You could try them piecemeal yourself.

If you have a formula that upper-limits the size of any collection, you can use with_capacity to ensure no reallocations.

I did not look through @quinedot's suggestions, but your original rust program ran for me in 125ms using the input 123456. After replacing

for r in lhr_del { lhr.retain(|&m| m != r)};

by

lhr_del.sort();
lhr.retain(|m| lhr_del.binary_search(&m).is_ok());

the runtime went down to 15ms. I didn't really thoroughly benchmark though, but I did run the program several times :slight_smile:

(e) I was able to bring that down to 7-8ms by the change indicated in the following code snippet (stuff commented out so one can see what was the original line:)

  // indentify and store all powers and cross-products of the lhr members < n-2
  let mut lhr_mults: Vec<usize> = vec![]; // lhr multiples, not part of a pcp
  //let mut lhr_dup = lhr.clone();
  let mut lhr_dup_inv: Vec<usize> = lhr.iter().copied().rev().collect();
  //let mut r = lhr_dup[0];
  let mut r: usize = lhr_dup_inv.pop().unwrap();
  //lhr_dup.remove(0);
  while !lhr_dup_inv.is_empty() {
    let rmax = rhi / r;                  // ri can't multiply r with values > this
    let last = lhr_dup_inv.pop().unwrap();
    //if lhr_dup[0] > rmax { break }       // exit if product of consecutive r’s > n-2
    if last > rmax { break }
    if r < rmax { lhr_mults.push(r * r)} // for r^2 multiples
    //for ri in &lhr_dup {                 // for each residue in reduced list
    for ri in iter::once(last).chain(lhr_dup_inv.iter().rev().copied()) {                 // for each residue in reduced list
      if ri > rmax { break }            // exit for r if cross-product with ri > n-2
      lhr_mults.push(r * ri);            // store value if < n-2
    }
    //r = lhr_dup[0];
    r = last;
    //lhr_dup.remove(0);
  }

Basically, the creation of lhr_mults is implemented somewhat differently. I can probably be improved further if one takes the time, but I need to finish for now :wink: Ah and btw, I only checked that it gives the same answer in my single test case, so please check correctness, if that code snippet is of interest!

(ee) Ok one last thing, in honor of nerdsniping. Replacing all vec![] with Vec::with_capacity(n) also gives a slight boost, but to quantify that properly you'd have to set up real benchmarks. Flamegraph now shows the part of the runtime we can influence is dominated by the binary search and the coprime check. Not sure, maybe the gcd is well optimized in other languagues and your own rust implementation falls short? No idea, just spitballing. Also, an unholy amount of runtime is taken by lld-linux-x86-64.so.2, so one might want to check a fully statically linked target, but this will diminish when the chosen argument is bigger...

1 Like

Taking all the suggestions into account, here's the fastest version I've been able to do.
If hope someone can do one with a different HashSet to see any speed improves.

Can someone show me how to take inputs with underscores, and from the cli like:
$ ./prime_pairs_lo 123_456_780

use std::time::Instant;
use std::collections::HashSet;

fn coprime(mut m: usize, mut n: usize) -> bool {
  while m|1 != 1 { let t = m; m = n % m; n = t }
  m > 0
}

fn prime_pairs_lohi(n : usize) {
  if n&1 == 1 || n < 4 { return println!("Input not even n > 2"); }
  if n <= 6 { return println!("[{}, {}]\n[{}, {}]\n[{}, {}]",n,1,n/2,n/2,n/2,n/2); };

  // generate the low-half-residues (lhr) r < n/2
  let (ndiv2, rhi) = (n/2, n-2);          // lhr:hhr midpoint, max residue limit
  let mut lhr: Vec<usize> = (3..ndiv2).step_by(2).filter(|&r| coprime(r, n)).collect();

  // identify and store all powers and cross-products of the lhr members < n-2
  let lesser = |r: usize| { if r > ndiv2 { n - r } else { r } };
  let mut lhr_del = HashSet::<usize>::new(); // lhr multiples, not part of a pcp
  for i in 1..lhr.len()-1 {                  // iterate thru lhr to find prime multiples
    let (mut j, r) = (i, lhr[i-1]);          // for current residue
    let rmax = rhi / r;                      // ri can't multiply r with values > this
    if r < rmax { lhr_del.insert(lesser(r * r));} // for r^2 multiples
    if lhr[i] > rmax { break }               // exit if product of consecutive r’s > n-2
    while lhr[j] <= rmax {                   // stop for r if cross-product with ri > n-2
      lhr_del.insert(lesser(r * lhr[j]));    // store value if < n-2
      j += 1;                                // get next lhr value
  } }

  // remove from lhr its lhr_mults, convert vals > n/2 to lhr complements first
  lhr.retain(|&m| !lhr_del.contains(&m));

  let lcnt = lhr.len();
  println!("[{}, {}]", n, lcnt);                    // show n and pcp prime pairs count
  println!("[{}, {}]", lhr[0],n-lhr[0]);            // show first pcp prime pair of n
  println!("[{}, {}]", lhr[lcnt-1], n-lhr[lcnt-1]); // show last  pcp prime pair of n
}

fn main() {
  let mut val = String::new();
  std::io::stdin().read_line(&mut val).expect("Failed to read line");
  let n: usize = val.trim().parse().unwrap();

  let start = Instant::now();
  prime_pairs_lohi(n);
  println!("{:?}", start.elapsed());
}

You can get an arguments iterator with std::env::args(), picking out the first argument (after the binary name) with .nth(1), then .replace('_', "") should work.

All together:

let n: usize = std::env::args()
    .nth(1)
    .expect("missing count argument")
    .replace('_', "")
    .parse()
    .expect("invalid count argument");

Note that I was still able to about halve the runtime by making lhr_del a Vec with capacity n, using lhr_del.sort_unstable() and then using binary search in retain (as shown above), this time using 123456780 as input.

I was able to eek out 10% by switching usize to u32, not sure if that's a valid optimization for your.

Question though: Is lcnt itself a significant output? Or are you just printing it for information purposes? I was able to get a speedup by not actually running the retain loop, which would leave the 2nd and 3rd lines printed correct, but not the 1st. Would that be viable?

It seems that other people have had the idea of using the sieve of Eratosthenes to find primes. These crates might be helpful (I haven't looked at them):

Anyway, according to Wikipedia, the Golbach conjecture is verified up to 4x10^18, which is more or less 2^64. I'm fairly sure your program (the lhr vector) uses more or less 2^63 bytes of memory, which is a lot.

As a mathematician, I can't help myself, I'm too curious: What leads you to believe that there is a proof of the Goldbach conjecture that everyone has missed for hundreds of years, yet it takes 5 pages and doesn't require any mathematical knowledge from after 1852?

2 Likes

OK, I built a proper cargo project with the source code, and compiled with current 1.84.1 as:

$ RUSTFLAGS="-C opt-level=3 -C debuginfo=0 -C target-cpu=native" cargo build --release

and renamed the binaray as: prime_pairs_lohi_rs

Can now run as: $ ./prime_pairs_lohi_rs 123_456_780

use std::time::Instant;
use std::collections::HashSet;

fn coprime(mut m: usize, mut n: usize) -> bool {
  while m|1 != 1 { let t = m; m = n % m; n = t }
  m > 0
}

fn prime_pairs_lohi(n : usize) {
  if n&1 == 1 || n < 4 { return println!("Input not even n > 2"); }
  if n <= 6 { return println!("[{}, {}]\n[{}, {}]\n[{}, {}]",n,1,n/2,n/2,n/2,n/2); };

  // generate the low-half-residues (lhr) r < n/2
  let (ndiv2, rhi) = (n/2, n-2);          // lhr:hhr midpoint, max residue limit
  let mut lhr: Vec<usize> = (3..ndiv2).step_by(2).filter(|&r| coprime(r, n)).collect();

  // identify and store all powers and cross-products of the lhr members < n-2
  let lesser = |r: usize| { if r < ndiv2 { r } else { n - r } };
  let mut lhr_del = HashSet::<usize>::new(); // lhr multiples, not part of a pcp
  for i in 1..lhr.len()-1 {                  // iterate thru lhr to find prime multiples
    let (mut j, r) = (i, lhr[i-1]);          // for current residue
    let rmax = rhi / r;                      // ri can't multiply r with values > this
    if r < rmax { lhr_del.insert(lesser(r * r));} // for r^2 multiples
    if lhr[i] > rmax { break }               // exit if product of consecutive r’s > n-2
    while lhr[j] <= rmax {                   // stop for r if cross-product with ri > n-2
      lhr_del.insert(lesser(r * lhr[j]));    // store value if < n-2
      j += 1;                                // get next lhr value
  } }

  lhr.retain(|&m| !lhr_del.contains(&m));    // remove from lhr its lhr mults, pcp remain
  let lcnt = lhr.len();
  println!("[{}, {}]", n, lcnt);             // show n and pcp prime pairs count
  println!("[{}, {}]", lhr[0],n-lhr[0]);     // show first pcp prime pair of n
  println!("[{}, {}]", lhr[lcnt-1], n-lhr[lcnt-1]); // show last  pcp prime pair of n
}

fn main() {
  let n: usize = std::env::args()
    .nth(1).expect("missing count argument")
    .replace('_', "").parse().expect("one input");

  let start = Instant::now();
  prime_pairs_lohi(n);
  println!("{:?}", start.elapsed());
}

Here are times vs current D (ldc2 1.40.0), Julia (1.11.3), and Crystal (1.15.1).

➜  ~ ./prime_pairs_lohi_d 100_000_000
[100000000, 291400]
[11, 99999989]
[49999757, 50000243]
7 secs, 395 ms, 855 μs, and 8

➜  ~ julia prime_pairs_lohi.jl
100000000
[100000000, 291400]
[11, 99999989]
[49999757, 50000243]
  7.478992 seconds

➜  ~ ./prime_pairs_lohi_cr 100_000_000
[100000000, 291400]
[11, 99999989]
[49999757, 50000243]
00:00:10.084498936

➜  ~ ./prime_pairs_lohi_rs 100_000_000
[100000000, 291400]
[11, 99999989]
[49999757, 50000243]
14.822477246s
------------------------------------------

  ~ ./prime_pairs_lohi_d 150_000_000
[150000000, 835256]
[43, 149999957]
[74999917, 75000083]
5 secs, 318 ms, 123 μs, and 6 hnsecs

➜  ~ julia prime_pairs_lohi.jl
150000000
[150000000, 835256]
[43, 149999957]
[74999917, 75000083]
  6.770088 seconds

➜  ~ ./prime_pairs_lohi_cr 150_000_000
[150000000, 835256]
[43, 149999957]
[74999917, 75000083]
00:00:07.710278395

➜  ~ ./prime_pairs_lohi_rs 150_000_000
[150000000, 835256]
[43, 149999957]
[74999917, 75000083]
11.180492142s

------------------------------------------
➜  ~ ./prime_pairs_lohi_d 700_000_000
[700000000, 1979689]
[47, 699999953]
[349999691, 350000309]
44 secs, 909 ms, 592 μs, and 3 hnsecs


➜  ~ julia prime_pairs_lohi.jl
700000000
[700000000, 1979689]
[47, 699999953]
[349999691, 350000309]
 49.378869 seconds

➜  ~ ./prime_pairs_lohi_rs 700_000_000
[700000000, 1979689]
[47, 699999953]
[349999691, 350000309]
106.998070622s

What was really surprising to me was how fast Julia turned out after people in
its forum went to work on my code version (my first ever Julia code).

I'm really surprised that for some reason Rust is so much slower.
I still suspect the speed diff is the array removal code.
lhr.retain(|&m| !lhr_del.contains(&m));

D|Crystal use arrays for lhr_del, not a HashSet, which I guess could be the bottleneck.

Also, this is a reference implementation from my paper to implement the algorithm in it.
Using bitarrays could extend mem, but significantly slow speed. Pick your priority.
Could use u32 arrays to extend mem, and limit n size to 2^32 - 1, the most frequent values.
I made two D versions, one for u32 and one for u64 (if you have enough mem for those values).
One of my latptops has 64GB so I can do into 5/6 billion, depending on the factorization of n.

As for the math questions.
Please study the paper.
Don't need to do no sieves!! The residues are the primes. That's the point!

It seems peoples brains shutdown when they realize how simple things are using modular groups.
But the math is irrefutable, and noone has attempted to claim anything in the paper is
mathematically in error (which it isn't). I updated the paper again today (Feb 15, 2025) to
put in shorter|simpler|faster Ruby versions, so get it off my Academia.edu site.

So far I have D, Go, Ruby, Rust, Julia, Crystal, and Python versions.
If people are interested in seeing them I can post them as gist files (gist.github.com).

Would really like to have a trully fast Rust version.